rm(list = ls())
if(!require(statnet)) install.packages("statnet", repos = "http://cran.us.r-project.org")
library("statnet")
if(!require(mice)) install.packages("mice", repos = "http://cran.us.r-project.org")
library("mice")
if(!require(texreg)) install.packages("texreg", repos = "http://cran.us.r-project.org")
library("texreg")
if(!require(xergm)) install.packages("xergm", repos = "http://cran.us.r-project.org")
library("xergm")
if(!require(ggplot2)) install.packages("ggplot2", repos = "http://cran.us.r-project.org")
library("ggplot2")
if(!require(kableExtra)) install.packages("kableExtra", repos = "http://cran.us.r-project.org")
library("kableExtra")
if(!require(GGally)) install.packages("GGally", repos = "http://cran.us.r-project.org")
library("GGally")
if(!require(scales)) install.packages("scales", repos = "http://cran.us.r-project.org")
library("scales")
options(width = 300)
We are going to use this document for the next two tutorials. The topics we are going to discuss are:
Before we start, this is a list of resources which we have found useful when learning about ERG models, running them, and finding about more comprehensive ways of using them.
R package we are going to use: Link
To begin, let's load the data just as we did last time. In this one long instruction we are going to load the data and create the network object.
## read data
el <- read.table('Edgelist_3Classes_Fall.csv', sep = ";",
                 header = TRUE)
dt <- read.table('Students_attributes_short.csv', sep = ";",
                 header = TRUE)
## create adjacency matrix
students <- unique(c(el$studentID, el$friend.ID.code))
mat <- matrix(0, nrow = length(students), ncol = length(students))
colnames(mat) <- as.character(students)
rownames(mat) <- as.character(students)
for (i in 1:nrow(el)) {
  row.index <- which(students == el[i, 1])
  col.index <- which(students == el[i, 2])
  mat[row.index, col.index] <- 1
}
## create network object
nw <- network(mat, directed = TRUE)
Now, just as we did last time, we are going to add some node covariates. Some of them have missing values, so for the sake of complete data we are going to use the same technique as last time (mice). I'm hiding the output from this chunk.
## add some attributes
identical(rownames(mat), dt$studentID)
att <- data.frame('studentID' = rownames(mat))
#impute some missing values for numeric variables
imp <- cbind(dt$studentID,
             dt$marks.overall.num,
             dt$drink.week.total,
             dt$risk.beh,
             dt$relig.part.num)
imp <- mice(imp)
imp <- complete(imp)
This is just to check that the imputation is working as intended, i.e., replacing NAs with imputed observations.
# For the grades
summary(dt$marks.overall.num)
summary(imp$V2)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 3.500 4.500 5.000 4.821 5.000 5.500 4
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.500 4.500 5.000 4.826 5.000 5.500
# For the amount of drinks
summary(dt$drink.week.total)
summary(imp$V3)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 1.000 4.000 5.829 8.000 33.000 2
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 4.000 5.628 7.500 33.000
# For the risky behaviour
summary(dt$risk.beh)
summary(imp$V4)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 1.333 2.333 2.137 2.833 4.000 4
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.333 2.333 2.140 2.833 4.000
# and for the religious attendance
summary(dt$relig.part.num)
summary(imp$V5)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 1.000 2.000 2.359 3.000 7.000 4
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 2.442 3.000 7.000
Now, onto the variables. This time we will include more information because we want to make sure our statistical approach matches our model. We should be familiar with the data by now, so we are not going to look at it variable by variable; we will simply load them all into the attributes of the network.
The variables we are including are:
# 0. class
att$class <- dt$class[match(att$studentID, dt$studentID)]
set.vertex.attribute(nw, 'class', as.character(att$class))
# 1. Gender
att$sex <- dt$sex[match(att$studentID, dt$studentID)]
set.vertex.attribute(nw, 'sex', as.character(att$sex))
# 2. marks (= school grades, the Swiss way)
att$grade.overall <- imp$V2[match(att$studentID, imp$V1)]
set.vertex.attribute(nw, 'grade', att$grade.overall)
# 3. stress.school.num (higher = more stress)
att$stress.school <- dt$stress.school.num[match(att$studentID, dt$studentID)]
set.vertex.attribute(nw, 'stress', att$stress.school)
# 4. life.satis.num (higher = happier)
att$life.satis.num <- dt$life.satis.num[match(att$studentID, dt$studentID)]
set.vertex.attribute(nw, 'life.sat', att$life.satis.num)
# 5. healthy.body.image (1 = healthy body image, 0 = not happy with body/weight/bmi, think they need to change even though their weight is normal)
att$healthy.body.image <- dt$healthy.body.image[match(att$studentID, dt$studentID)]
set.vertex.attribute(nw, 'body.image', as.character(att$healthy.body.image))
# 6. drink.week.total (nr of glasses)
att$drink.week.total <- imp$V3[match(att$studentID, imp$V1)]
set.vertex.attribute(nw, 'drink', att$drink.week.total)
# 7. sport.hours (nr of sport hours)
att$sport.hours <- dt$sport.hours[match(att$studentID, dt$studentID)]
set.vertex.attribute(nw, 'sport', att$sport.hours)
# 8. risk behavior (higher = more inclined to take risks (drive drunk, no seatbelt, no helmet))
att$risk.beh <- imp$V4[match(att$studentID, imp$V1)]
set.vertex.attribute(nw, 'risk', att$risk.beh)
# 9. college bound (1 = heading for university/college)
att$college.bound <- dt$college.bound[match(att$studentID, dt$studentID)]
set.vertex.attribute(nw, 'college.bound', as.character(att$college.bound))
# 10. relig.part.num (higher = more often attending religious services)
att$relig.part.num <- imp$V5[match(att$studentID, imp$V1)]
set.vertex.attribute(nw, 'relig', att$relig.part.num)
# 11. being a gamer or not (1 = gaming more than 2hr a week)
att$gaming <- dt$games[match(att$studentID, dt$studentID)]
set.vertex.attribute(nw, 'gaming', att$gaming)
# 12. watching TV (hr watching tv)
att$hr.tv <- dt$hr.tv[match(att$studentID, dt$studentID)]
set.vertex.attribute(nw, 'tvhours', att$hr.tv)
# 13. liking school
att$liking.school <- dt$liking.school[match(att$studentID, dt$studentID)]
set.vertex.attribute(nw, 'likingschool', as.character(att$liking.school))
Remember that you can also create node attributes in the following way:
nw %v% 'likingschool' = as.character(att$liking.school)
You will find detailed information about all the available model terms by running the following command. This section is designed to give an overview of the most common ERG model terms in the literature. Make sure that you keep the help section in mind when doing social network analysis. If you would prefer to read a good summary of the terms, this is a useful resource: https://rdrr.io/cran/ergm/man/ergm-terms.html
# ERGM terms help section
?'ergm-terms'
The simplest ERG model you can run involves only an edges term. It is basically mandatory for every ERGM you estimate - just like the intercept in a simple linear regression. We call this the null model.
This simple model only controls for the density of the network. In other words, it makes sure that all the simulated networks have the same density.
We are going to include the whole output from the instructions in the first examples, or whenever we consider there is something of interest worth mentioning. Without further ado, let's run the first ERG model.
fit0 <- ergm(nw ~ edges)
The same summary function that produces output tables for regression models works for ERGMs. Let's see what we can read from this:
summary(fit0)
##
## ==========================
## Summary of model fit
## ==========================
##
## Formula: nw ~ edges
##
## Iterations: 6 out of 20
##
## Monte Carlo MLE Results:
## Estimate Std. Error MCMC % z value Pr(>|z|)
## edges -2.52394 0.08978 0 -28.11 <1e-04 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Null Deviance: 2503.6 on 1806 degrees of freedom
## Residual Deviance: 954.9 on 1805 degrees of freedom
##
## AIC: 956.9 BIC: 962.4 (Smaller is better.)
As in logistic models, exponentiating a coefficient transforms the log-odds into an odds ratio, and dividing exp(coef) by 1 + exp(coef) gives you the probability of a tie. In our example:
# We can export the value of the coefficient using the object created by the ergm function:
# We run the `coef` function on the fitted model, and ask for the element with the covariate of interest
coef0 <- as.numeric(coef(fit0)['edges'])
# The probability
exp(coef0)/(1 + exp(coef0))
## [1] 0.07419712
The probability of a tie should correspond to the number of edges divided by the number of possible dyads in the network. We already know the number of edges. To calculate the number of potential dyads we simply compute \(n(n-1)\), where \(n\) is the number of nodes in the network.
# Number of edges using the 'summary function':
e <- as.numeric(summary(nw ~ edges))
# Number of nodes
n <- length(att$studentID)
# Edge probability:
e/(n*(n-1))
## [1] 0.07419712
A brief note on model term selection
Unlike traditional regression models, which can be analysed by adding more and more covariates one at a time, ERG models should be run with all the covariates of interest at the same time, especially when specifying endogenous terms.
If you try to fit a model assuming the presence of triads is the only variable of importance, the model might not converge: it may fail to find improved models and become degenerate. A better alternative is to add degree distributions and other model terms alongside your triadic-closure variables, so that the model terms together make it easier to find and maximise the likelihood.
That said, let us build the ERGM through small increments, purely for didactic purposes.
Estimates resulting from logistic regressions are given as odds ratios. That is, the odds of an event happening for one group divided by the odds of the event happening for another group.
Suppose that the probability of an event occurring is \(P\). The odds of that event occurring are \(P/(1-P)\). The odds ratio of that event happening relative to a different one is \(\displaystyle{\frac{P/(1-P)}{Q/(1-Q)}}\), where \(Q\) is the probability of occurrence of the second event. Odds ratios are NOT the same thing as relative risk, but in extreme cases, where probabilities are very low, the two values can be very similar.
Keep this small note in mind when interpreting odds ratios. There is a larger conversation in the academic community regarding the interpretation of odds ratios, and whether or not they should be used when publishing results.
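A tiny worked example shows how close the two quantities are when probabilities are small (the probabilities below are made up for illustration):
# Hypothetical probabilities for two groups
P <- 0.10
Q <- 0.05
# The odds for each group
odds_P <- P / (1 - P)  # 0.111
odds_Q <- Q / (1 - Q)  # 0.053
# Odds ratio versus relative risk
odds_P / odds_Q        # 2.11
P / Q                  # 2.00 -- close, but not identical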
These are some links that we have found useful in this discussion:
The mutual() term is used in directed networks to indicate how many ties are reciprocated.
In our network, there are 34 mutual dyads among the 134 total ties.
# We can use the same function from above
summary(nw ~ mutual)
## mutual
## 34
You can test each ERGM term using the summary() command.
Let us test whether there are more mutual ties than expected in our network. To do this we need to use the aptly named ergm function from the ergm package. A useful workflow for this (and for most regression exercises as well) is to define the model first and then feed it into the function. This gives you more control when dealing with increasingly complicated models and functions.
m1 <- as.formula(nw ~      # The network
                   mutual + # How many ties are reciprocated?
                   edges)   # Edges term, which should always be included, so we leave it at the end
fit1 <- ergm(m1)
## Starting maximum pseudolikelihood estimation (MPLE):
## Evaluating the predictor and response matrix.
## Maximizing the pseudolikelihood.
## Finished MPLE.
## Starting Monte Carlo maximum likelihood estimation (MCMLE):
## Iteration 1 of at most 20:
## Optimizing with step length 1.
## The log-likelihood improved by 0.002538.
## Step length converged once. Increasing MCMC sample size.
## Iteration 2 of at most 20:
## Optimizing with step length 1.
## The log-likelihood improved by 0.002514.
## Step length converged twice. Stopping.
## Finished MCMLE.
## Evaluating log-likelihood at the estimate. Using 20 bridges: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 .
## This model was fit using MCMC. To examine model diagnostics and check for degeneracy, use the mcmc.diagnostics() function.
summary(fit1)
##
## ==========================
## Summary of model fit
## ==========================
##
## Formula: nw ~ mutual + edges
##
## Iterations: 2 out of 20
##
## Monte Carlo MLE Results:
## Estimate Std. Error MCMC % z value Pr(>|z|)
## mutual 3.2312 0.2975 0 10.86 <1e-04 ***
## edges -3.1929 0.1262 0 -25.31 <1e-04 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Null Deviance: 2503.6 on 1806 degrees of freedom
## Residual Deviance: 848.6 on 1804 degrees of freedom
##
## AIC: 852.6 BIC: 863.6 (Smaller is better.)
In this particular model, the number of mutual ties is compared to generated random networks with the same density. The effect is positive and significant, indicating there are more mutual links in our network than what one would expect from a random network with 134 edges.
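As with the edges term, we can put the size of this effect on the odds scale (a quick check computed from the fitted object above):
# Odds multiplier for a tie that reciprocates an existing tie
exp(coef(fit1)['mutual'])
# roughly exp(3.23) = 25: in odds terms, a tie is about 25 times more
# likely when it reciprocates an existing tie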
You can add attributes to your model terms to compare nodes with different attribute values. Let's explore how differences in gender affect the reciprocity of friendships:
m1b <- as.formula(nw ~ # The network
                    mutual('sex', diff = TRUE) + # Reciprocated ties, separately by gender
                    edges) # Edges term, which should always be included, so we leave it at the end
fit1b <- ergm(m1b)
## Starting maximum pseudolikelihood estimation (MPLE):
## Evaluating the predictor and response matrix.
## Maximizing the pseudolikelihood.
## Finished MPLE.
## Starting Monte Carlo maximum likelihood estimation (MCMLE):
## Iteration 1 of at most 20:
## Optimizing with step length 1.
## The log-likelihood improved by 1.76.
## Step length converged once. Increasing MCMC sample size.
## Iteration 2 of at most 20:
## Optimizing with step length 1.
## The log-likelihood improved by 0.1853.
## Step length converged twice. Stopping.
## Finished MCMLE.
## Evaluating log-likelihood at the estimate. Using 20 bridges: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 .
## This model was fit using MCMC. To examine model diagnostics and check for degeneracy, use the mcmc.diagnostics() function.
summary(fit1b)
##
## ==========================
## Summary of model fit
## ==========================
##
## Formula: nw ~ mutual("sex", diff = TRUE) + edges
##
## Iterations: 2 out of 20
##
## Monte Carlo MLE Results:
## Estimate Std. Error MCMC % z value Pr(>|z|)
## mutual.same.sex.female 3.8141 0.3499 0 10.900 <1e-04 ***
## mutual.same.sex.male 3.1376 0.3563 0 8.806 <1e-04 ***
## edges -2.9712 0.1127 0 -26.355 <1e-04 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Null Deviance: 2504 on 1806 degrees of freedom
## Residual Deviance: 853 on 1803 degrees of freedom
##
## AIC: 859 BIC: 875.5 (Smaller is better.)
Judging by the output of the ergm function, female students have a slightly higher propensity to engage in mutual ties. Overall, the effects are quite similar for the two groups.
This model term checks whether the number of nodes with a specific degree is more or less common than in a random network. Let's see how many nodes have an in- or out-degree between 0 and 10.
summary(nw ~ idegree(0:10))
## idegree0 idegree1 idegree2 idegree3 idegree4 idegree5 idegree6 idegree7 idegree8 idegree9 idegree10
## 3 8 8 9 4 4 3 3 1 0 0
summary(nw ~ odegree(0:10))
## odegree0 odegree1 odegree2 odegree3 odegree4 odegree5 odegree6 odegree7 odegree8 odegree9 odegree10
## 4 8 7 7 5 6 1 5 0 0 0
m2a <- as.formula(nw ~ # The network
                    mutual + # How many ties are reciprocated?
                    idegree(1:7) + # in-degree terms for degrees 1-7
                    edges) # Edges term, which should always be included, so we leave it at the end
fit2a <- ergm(m2a)
## Starting maximum pseudolikelihood estimation (MPLE):
## Evaluating the predictor and response matrix.
## Maximizing the pseudolikelihood.
## Finished MPLE.
## Starting Monte Carlo maximum likelihood estimation (MCMLE):
## Iteration 1 of at most 20:
## Optimizing with step length 1.
## The log-likelihood improved by 1.127.
## Step length converged once. Increasing MCMC sample size.
## Iteration 2 of at most 20:
## Optimizing with step length 1.
## The log-likelihood improved by 0.06881.
## Step length converged twice. Stopping.
## Finished MCMLE.
## Evaluating log-likelihood at the estimate. Using 20 bridges: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 .
## This model was fit using MCMC. To examine model diagnostics and check for degeneracy, use the mcmc.diagnostics() function.
summary(fit2a)
##
## ==========================
## Summary of model fit
## ==========================
##
## Formula: nw ~ mutual + idegree(1:7) + edges
##
## Iterations: 2 out of 20
##
## Monte Carlo MLE Results:
## Estimate Std. Error MCMC % z value Pr(>|z|)
## mutual 3.2234 0.2991 0 10.777 <1e-04 ***
## idegree1 -0.2732 0.6452 0 -0.424 0.6719
## idegree2 -0.7799 0.6305 0 -1.237 0.2161
## idegree3 -0.7556 0.6021 0 -1.255 0.2095
## idegree4 -1.3161 0.7178 0 -1.833 0.0667 .
## idegree5 -0.8493 0.7563 0 -1.123 0.2614
## idegree6 -0.4387 0.8603 0 -0.510 0.6101
## idegree7 0.4467 0.9146 0 0.488 0.6253
## edges -3.1816 0.1566 0 -20.321 <1e-04 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Null Deviance: 2503.6 on 1806 degrees of freedom
## Residual Deviance: 841.5 on 1797 degrees of freedom
##
## AIC: 859.5 BIC: 909 (Smaller is better.)
What about out-degree?
m2b <- as.formula(nw ~ # The network
                    mutual + # How many ties are reciprocated?
                    odegree(1:7) + # out-degree terms for degrees 1-7
                    edges) # Edges term, which should always be included, so we leave it at the end
fit2b <- ergm(m2b)
## Starting maximum pseudolikelihood estimation (MPLE):
## Evaluating the predictor and response matrix.
## Maximizing the pseudolikelihood.
## Finished MPLE.
## Starting Monte Carlo maximum likelihood estimation (MCMLE):
## Iteration 1 of at most 20:
## Optimizing with step length 1.
## The log-likelihood improved by 1.235.
## Step length converged once. Increasing MCMC sample size.
## Iteration 2 of at most 20:
## Optimizing with step length 1.
## The log-likelihood improved by 0.02896.
## Step length converged twice. Stopping.
## Finished MCMLE.
## Evaluating log-likelihood at the estimate. Using 20 bridges: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 .
## This model was fit using MCMC. To examine model diagnostics and check for degeneracy, use the mcmc.diagnostics() function.
summary(fit2b)
##
## ==========================
## Summary of model fit
## ==========================
##
## Formula: nw ~ mutual + odegree(1:7) + edges
##
## Iterations: 2 out of 20
##
## Monte Carlo MLE Results:
## Estimate Std. Error MCMC % z value Pr(>|z|)
## mutual 3.2090 0.3006 0 10.68 <1e-04 ***
## odegree1 1.4053 NA NA NA NA
## odegree2 2.7213 NA NA NA NA
## odegree3 4.5899 NA NA NA NA
## odegree4 6.4370 NA NA NA NA
## odegree5 9.0524 NA NA NA NA
## odegree6 9.8870 NA NA NA NA
## odegree7 14.3300 NA NA NA NA
## edges -5.1244 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Null Deviance: 2503.6 on 1806 degrees of freedom
## Residual Deviance: 833.5 on 1797 degrees of freedom
##
## AIC: 851.5 BIC: 901 (Smaller is better.)
The inclusion of several out-degree measures wipes out the standard errors of the remaining terms (note the NAs in the output above) because of multicollinearity - the out-degree terms and the edges term reflect the same information in the network.
A possible alternative to idegree and odegree is gwdegree(), a more appealing way to model the in- or out-degree distributions that is considered a more stable term. The gw in gwdegree() stands for 'geometrically weighted': the term measures a node's tendency to have multiple outgoing or incoming ties, with weights applied to each additional connection a node has. One of the term's parameters (the \(\alpha\) value) is the decay of those weights: values close to 0 give more relative weight to smaller degree counts. We can explore different weights to see how they change the estimation.
A good way of determining which \(\alpha\) value we should use is to look at the Bayesian Information Criterion (BIC) of the resulting regressions. Note: remember that smaller BIC values indicate a better model fit.
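To build some intuition for the decay parameter, the sketch below computes the marginal contribution of a node's d-th tie to the geometrically weighted degree statistic, which under the standard parameterisation is \((1-e^{-\alpha})^{d-1}\). This is our own illustration of the weighting scheme, not output from the ergm package:
# Marginal contribution of a node's d-th tie, for several decay values
gwd_increment <- function(alpha, d) (1 - exp(-alpha))^(d - 1)
d <- 1:8
round(sapply(c(decay0.5 = 0.5, decay1 = 1, decay2 = 2), gwd_increment, d = d), 3)
# Smaller decay values shrink the increments faster, which puts
# relatively more weight on low degree counts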
Graphical illustration of the gwdegree() term.
The commands below, and the resulting outputs, show the distribution of in-degree and out-degree for the network above. Note: gwd_nw is the name of the drawn network.
# Geometrically weighted indegree
summary(gwd_nw ~ gwidegree(0:10))
## gwidegree#1 gwidegree#2 gwidegree#3 gwidegree#4 gwidegree#5 gwidegree#6 gwidegree#7
## 4 3 0 1 0 0 0
# Geometrically weighted outdegree
summary(gwd_nw ~ gwodegree(0:10))
## gwodegree#1 gwodegree#2 gwodegree#3 gwodegree#4 gwodegree#5 gwodegree#6 gwodegree#7
## 3 4 1 0 0 0 0
m3a <- as.formula(nw ~ # The network
                    gwodegree(0.5, fixed = TRUE) + # Geometrically weighted OUTdegree with a decay of 0.5
                    gwidegree(0.5, fixed = TRUE) + # Geometrically weighted INdegree with a decay of 0.5
                    edges) # Edges term, which should always be included, so we leave it at the end
m3b <- as.formula(nw ~ # The network
                    gwodegree(1, fixed = TRUE) + # Geometrically weighted OUTdegree with a decay of 1
                    gwidegree(1, fixed = TRUE) + # Geometrically weighted INdegree with a decay of 1
                    edges) # Edges term, which should always be included, so we leave it at the end
m3c <- as.formula(nw ~ # The network
                    gwodegree(2, fixed = TRUE) + # Geometrically weighted OUTdegree with a decay of 2
                    gwidegree(2, fixed = TRUE) + # Geometrically weighted INdegree with a decay of 2
                    edges) # Edges term, which should always be included, so we leave it at the end
fit3a <- ergm(m3a)
fit3b <- ergm(m3b)
fit3c <- ergm(m3c)
We can compare the results of the three estimations using the screenreg() function from the texreg package.
screenreg(list(fit3a, fit3b, fit3c))
##
## ======================================================
## Model 1 Model 2 Model 3
## ------------------------------------------------------
## gwodeg.fixed.0.5 -1.24 **
## (0.46)
## gwideg.fixed.0.5 -0.94
## (0.48)
## edges -2.16 *** -1.81 *** -0.65
## (0.14) (0.20) (0.48)
## gwodeg.fixed.1 -1.21 **
## (0.42)
## gwideg.fixed.1 -1.00 *
## (0.45)
## gwodeg.fixed.2 -1.52 **
## (0.54)
## gwideg.fixed.2 -1.37 *
## (0.56)
## ------------------------------------------------------
## AIC 951.83 949.87 950.67
## BIC 968.32 966.37 967.17
## Log Likelihood -472.91 -471.93 -472.34
## ======================================================
## *** p < 0.001, ** p < 0.01, * p < 0.05
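If you prefer to extract those fit statistics programmatically instead of reading them off the table, the BIC() generic should work directly on fitted ergm objects:
# Compare the three decay values by BIC (smaller is better)
sapply(list(decay0.5 = fit3a, decay1 = fit3b, decay2 = fit3c), BIC)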
The interpretation of these terms is not very intuitive - they mostly serve as control variables and help stabilise the model. In rough terms, however, a gwdegree term speaks to how evenly or unevenly degrees are distributed throughout the network, in comparison to a random network.
This Shiny App by Michael Levy has a simple simulation that shows the effect of different values of a gwdegree parameter, as well as other useful ERG model information.
The following set of terms measures the number of triads in the network and compares it to the number of triads in random networks. These terms have a tendency to produce model degeneracy - that is, the simulations based on the triad terms produce networks too dissimilar from the observed network.
In the words of Hunter:
“When a model is not a good representation of the observed network, these simulated networks may be far enough away from the observed network that the estimation process is affected. In the worst case scenario, the simulated networks will be so different that the algorithm fails altogether.”
Degeneracy is a matter of how well the specified model compares to the observed network. Fortunately there are alternatives to using these terms, such as gwesp() (see more on this later in the document).
The following chunk shows the number of triangles (triangle), transitive triples (\(a \rightarrow b \rightarrow c \leftarrow a\); ttriple), and cyclic triples (\(a \rightarrow b \rightarrow c \rightarrow a\); ctriple).
summary(nw ~ triangle + ttriple + ctriple)
## triangle ttriple ctriple
## 216 176 40
Notice that the number of transitive triples and cyclic triples sums to the number of triangles.
The number of appearances of these configurations would seem to warrant including them in the model estimations. However, if we run a simple ERGM with only the triangle term, we see that the estimation does not converge:
# A model that does not converge:
m4 <- as.formula(nw ~ # The network
                   triangle + # The term to include triangles
                   edges) # Edges term, which should always be included, so we leave it at the end
fit4 <- ergm(m4)
## Error in ergm.MCMLE(init, nw, model, initialfit = (initialfit <- NULL), : Unconstrained MCMC sampling did not mix at all. Optimization cannot continue.
There are some cases for which we should include triad statistics. Consider for example information sharing networks, where the direction of how the information is flowing is important for the estimated results. Morris, Handcock and Hunter (2008)’s Specification of Exponential-Family Random Graph Models: Terms and Computational Aspects has some more information on how to use and include these terms.
As your models become more and more complicated, you will encounter problems with the convergence of the MCMC routine. You can tweak the control statistics (as above) to see if that makes things better. Increasing the MCMC burn-in gives the MCMC chain more steps to converge. If, in the MCMC diagnostics (printed at the bottom), the samples differ significantly from the observed network, you should increase MCMC.samplesize. There are other techniques you could use, but this is a whole field of literature which we do not really have time to get into in this short course.
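As a hedged sketch of what such tweaking can look like (the values below are purely illustrative, not recommendations, and a triangle-only model may refuse to converge regardless):
# Re-fit the triangle model with a longer burn-in and a larger sample
fit4b <- ergm(m4,
              control = control.ergm(MCMC.burnin = 50000,    # more steps before sampling
                                     MCMC.samplesize = 5000, # more sampled networks
                                     seed = 12345))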
Make sure that you are always keeping the theoretical model in mind. Debugging a faulty estimation should not be a matter of fishing for the parameter which will make the model run effectively. However, consider that if nothing else works, your model might be misspecified. Go back a few steps, use different covariates that make theoretical sense or try different model terms.
But before we despair: it is a common result in the estimation literature that these endogenous terms only work once you've incorporated other exogenous variables into your model. ERG models work best when you have included all the model terms that your theory of the world says affect the data generating process of your network. If your model refuses to converge, try specifying it with all your independent and control variables and start your debugging from there. Again, always keep the theoretical model in mind.
All of the model terms we discussed previously deal with the outgoing and incoming connections of the nodes in the network. This following section looks at node covariates as terms in the ERGM equation.
nodefactor
We are going to explore the differences in the way connections are made between nodes given a categorical covariate. We use the nodefactor() term for this.
m7 <- as.formula(nw ~
                   nodefactor('sex') + # Difference in connections conditional on gender
                   mutual +
                   edges) # edges term
fit7 <- ergm(m7)
summary(fit7)
##
## ==========================
## Summary of model fit
## ==========================
##
## Formula: nw ~ nodefactor("sex") + mutual + edges
##
## Iterations: 2 out of 20
##
## Monte Carlo MLE Results:
## Estimate Std. Error MCMC % z value Pr(>|z|)
## nodefactor.sex.male -0.03073 0.10616 0 -0.289 0.772
## mutual 3.22794 0.29996 0 10.761 <1e-04 ***
## edges -3.16146 0.16505 0 -19.155 <1e-04 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Null Deviance: 2503.6 on 1806 degrees of freedom
## Residual Deviance: 849.1 on 1803 degrees of freedom
##
## AIC: 855.1 BIC: 871.6 (Smaller is better.)
Results suggest that there is no statistically significant difference between how men and women make connections in this particular network: neither group has a higher probability of forming an edge. Notice that in the case of a node covariate we can use the local interpretation of the logit regression in the ERG model. With the mutual term we used the global interpretation: we discussed the probability of something occurring more often in our entire network compared to random networks.
In contrast, the local interpretation is the same as for any logit regression model: the gender coefficient of -0.03 indicates that the odds of a man forming a tie versus a woman forming a tie are slightly lower, by a factor of exp(-0.03) = 0.97 (if the effect were significant!).
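We can compute that odds multiplier directly from the fitted object:
# Odds of a tie involving a male node, relative to the female baseline
exp(coef(fit7)['nodefactor.sex.male'])
# about 0.97, i.e. essentially no difference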
In the case of directed networks, we can run the same estimation while differentiating between in-degree and out-degree, to determine how the attribute affects incoming and outgoing ties:
m7b <- as.formula(nw ~
                    nodeofactor('sex') + # Difference in outgoing connections conditional on gender
                    nodeifactor('sex') + # Difference in incoming connections conditional on gender
                    mutual +
                    edges)
fit7b <- ergm(m7b)
summary(fit7b)
##
## ==========================
## Summary of model fit
## ==========================
##
## Formula: nw ~ nodeofactor("sex") + nodeifactor("sex") + mutual + edges
##
## Iterations: 2 out of 20
##
## Monte Carlo MLE Results:
## Estimate Std. Error MCMC % z value Pr(>|z|)
## nodeofactor.sex.male -0.02984 0.20164 0 -0.148 0.882
## nodeifactor.sex.male -0.03541 0.20468 0 -0.173 0.863
## mutual 3.23081 0.29980 0 10.777 <1e-04 ***
## edges -3.16085 0.16875 0 -18.731 <1e-04 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Null Deviance: 2503.6 on 1806 degrees of freedom
## Residual Deviance: 848.8 on 1802 degrees of freedom
##
## AIC: 856.8 BIC: 878.8 (Smaller is better.)
Well now, since gender does not seem to affect (incoming or outgoing) friendship formation rates, let’s see if other categorical variables matter. The variables we include here reflect the model we hypothesise determines the formation of ties.
m8 <- as.formula(nw ~
                   nodefactor('sex') + # Incoming AND outgoing gender
                   nodeofactor('college.bound') + # Outgoing: bound to college?
                   nodeifactor('college.bound') + # Incoming: bound to college?
                   nodeofactor('likingschool') + # Outgoing: likes school
                   nodeifactor('likingschool') + # Incoming: likes school
                   nodeofactor('gaming') + # Outgoing: considered a gamer?
                   nodeifactor('gaming') + # Incoming: considered a gamer?
                   mutual +
                   edges)
fit8 <- ergm(m8)
summary(fit8)
##
## ==========================
## Summary of model fit
## ==========================
##
## Formula: nw ~ nodefactor("sex") + nodeofactor("college.bound") + nodeifactor("college.bound") +
## nodeofactor("likingschool") + nodeifactor("likingschool") +
## nodeofactor("gaming") + nodeifactor("gaming") + mutual +
## edges
##
## Iterations: 2 out of 20
##
## Monte Carlo MLE Results:
## Estimate Std. Error MCMC % z value Pr(>|z|)
## nodefactor.sex.male 0.002667 0.123924 0 0.022 0.9828
## nodeofactor.college.bound.1 -0.077226 0.231374 0 -0.334 0.7386
## nodeifactor.college.bound.1 -0.472448 0.230990 0 -2.045 0.0408 *
## nodeofactor.likingschool.yes, a bit 0.298858 0.279043 0 1.071 0.2842
## nodeofactor.likingschool.yes, very 0.911411 0.358044 0 2.546 0.0109 *
## nodeifactor.likingschool.yes, a bit -0.169229 0.268810 0 -0.630 0.5290
## nodeifactor.likingschool.yes, very 0.237629 0.365234 0 0.651 0.5153
## nodeofactor.gaming.1 0.197720 0.218963 0 0.903 0.3665
## nodeifactor.gaming.1 0.029173 0.218705 0 0.133 0.8939
## mutual 3.201924 0.317320 0 10.091 <1e-04 ***
## edges -3.239747 0.320722 0 -10.101 <1e-04 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Null Deviance: 2503.6 on 1806 degrees of freedom
## Residual Deviance: 833.5 on 1795 degrees of freedom
##
## AIC: 855.5 BIC: 916 (Smaller is better.)
There are two results worth mentioning in this output table:
College-bound students have fewer incoming connections than non-college-bound students. This can be seen in the negative coefficient of nodeifactor.college.bound.1: the odds of receiving a friendship tie from another student are lower by a factor of exp(-0.47) = 0.62 compared to non-college-bound students.
Students who said they enjoy school very much nominate slightly more friends than students who do not like school (those who answered 'no' to the question - the base case). The coefficient (nodeofactor.likingschool.yes, very) is positive and significant, representing an increase in the odds of nominating another friend by a factor of 2.49 (exp(0.91)).
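Both odds-ratio figures can be reproduced by exponentiating the coefficients, using the names exactly as they appear in the summary above:
exp(coef(fit8)[c('nodeifactor.college.bound.1',
                 'nodeofactor.likingschool.yes, very')])
# about 0.62 for college-bound in-ties, and 2.49 for the out-ties of
# students who like school very much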
NOTE: nodefactor() terms ALWAYS use a baseline category. You can change the baseline category using the option base = NUMBER:
m8b <- as.formula(nw ~
                    nodefactor('sex') + # Incoming AND outgoing gender
                    nodeofactor('college.bound') + # Outgoing: bound to college?
                    nodeifactor('college.bound') + # Incoming: bound to college?
                    nodeofactor('likingschool', base = -3) + # Outgoing: likes school
                    nodeifactor('likingschool', base = 3) + # Incoming: likes school
                    nodeofactor('gaming') + # Outgoing: considered a gamer?
                    nodeifactor('gaming') + # Incoming: considered a gamer?
                    mutual +
                    edges)
fit8b <- ergm(m8b)
summary(fit8b)
##
## ==========================
## Summary of model fit
## ==========================
##
## Formula: nw ~ nodefactor("sex") + nodeofactor("college.bound") + nodeifactor("college.bound") +
## nodeofactor("likingschool", base = -3) + nodeifactor("likingschool",
## base = 3) + nodeofactor("gaming") + nodeifactor("gaming") +
## mutual + edges
##
## Iterations: 2 out of 20
##
## Monte Carlo MLE Results:
## Estimate Std. Error MCMC % z value Pr(>|z|)
## nodefactor.sex.male -0.01046 0.12293 0 -0.085 0.9322
## nodeofactor.college.bound.1 -0.13250 0.22354 0 -0.593 0.5533
## nodeifactor.college.bound.1 -0.45115 0.23263 0 -1.939 0.0525 .
## nodeofactor.likingschool.yes, very 0.71024 0.30348 0 2.340 0.0193 *
## nodeifactor.likingschool.no -0.30822 0.35775 0 -0.862 0.3889
## nodeifactor.likingschool.yes, a bit -0.35800 0.34012 0 -1.053 0.2925
## nodeofactor.gaming.1 0.23169 0.21755 0 1.065 0.2869
## nodeifactor.gaming.1 0.03156 0.21908 0 0.144 0.8855
## mutual 3.18206 0.30700 0 10.365 <1e-04 ***
## edges -2.77293 0.38639 0 -7.177 <1e-04 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Null Deviance: 2503.6 on 1806 degrees of freedom
## Residual Deviance: 834.9 on 1796 degrees of freedom
##
## AIC: 854.9 BIC: 909.9 (Smaller is better.)
nodecov
Now let's look at nodecov(), which is used for continuous variables. This will allow us to check whether students with higher grades have a tendency to nominate more friends or to receive more friendship ties.
m9 <- as.formula(nw ~
                   nodecov('grade') +
                   mutual +
                   edges)
fit9 <- ergm(m9)
summary(fit9)
##
## ==========================
## Summary of model fit
## ==========================
##
## Formula: nw ~ nodecov("grade") + mutual + edges
##
## Iterations: 2 out of 20
##
## Monte Carlo MLE Results:
## Estimate Std. Error MCMC % z value Pr(>|z|)
## nodecov.grade -0.09055 0.13620 0 -0.665 0.5062
## mutual 3.21298 0.30144 0 10.659 <1e-04 ***
## edges -2.31908 1.31720 0 -1.761 0.0783 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Null Deviance: 2503.6 on 1806 degrees of freedom
## Residual Deviance: 848.2 on 1803 degrees of freedom
##
## AIC: 854.2 BIC: 870.7 (Smaller is better.)
This is not the case. Let's see if there are differences between sending and receiving friendships:
m9b <- as.formula(nw ~
                    nodeicov('grade') +
                    nodeocov('grade') +
                    mutual +
                    edges)
fit9b <- ergm(m9b)
summary(fit9b)
##
## ==========================
## Summary of model fit
## ==========================
##
## Formula: nw ~ nodeicov("grade") + nodeocov("grade") + mutual + edges
##
## Iterations: 2 out of 20
##
## Monte Carlo MLE Results:
## Estimate Std. Error MCMC % z value Pr(>|z|)
## nodeicov.grade -0.5369 0.2581 0 -2.080 0.0375 *
## nodeocov.grade 0.3956 0.2797 0 1.414 0.1572
## mutual 3.2860 0.3059 0 10.743 <1e-04 ***
## edges -2.5413 1.3723 0 -1.852 0.0640 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Null Deviance: 2503.6 on 1806 degrees of freedom
## Residual Deviance: 844.9 on 1802 degrees of freedom
##
## AIC: 852.9 BIC: 874.9 (Smaller is better.)
The results look quite different now: there is a statistically significant negative effect of higher grades on incoming ties, i.e. students with higher grades receive fewer friendship nominations. This is a way in which we can use the fact that we are dealing with a directed network to extract as much information as possible from the model.
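Expressed as an odds multiplier:
# Each additional grade point lowers the odds of receiving a nomination
exp(coef(fit9b)['nodeicov.grade'])
# about exp(-0.54) = 0.58 per grade point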
We will explore the differential effect this variable has when we include the gender of the students further down. We need to include a few other variables before we do so.
m10 <- as.formula(nw ~
                    nodeifactor('college.bound') + # Incoming: bound to college?
                    nodeofactor('college.bound') + # Outgoing: bound to college?
                    nodeifactor('likingschool') + # Incoming: likes school
                    nodeofactor('likingschool') + # Outgoing: likes school
                    nodeicov('grade') + # Incoming: grades
                    nodeocov('grade') + # Outgoing: grades
                    nodeocov('stress') + # Outgoing: stress level
                    nodeicov('stress') + # Incoming: stress level
                    nodeocov('risk') + # Outgoing: risky behaviour
                    nodeicov('risk') + # Incoming: risky behaviour
                    nodeocov('tvhours') + # Outgoing: hours watching tv
                    nodeicov('tvhours') + # Incoming: hours watching tv
                    mutual +
                    edges)
fit10 <- ergm(m10)
summary(fit10)
##
## ==========================
## Summary of model fit
## ==========================
##
## Formula: nw ~ nodeifactor("college.bound") + nodeofactor("college.bound") +
## nodeifactor("likingschool") + nodeofactor("likingschool") +
## nodeicov("grade") + nodeocov("grade") + nodeocov("stress") +
## nodeicov("stress") + nodeocov("risk") + nodeicov("risk") +
## nodeocov("tvhours") + nodeicov("tvhours") + mutual + edges
##
## Iterations: 2 out of 20
##
## Monte Carlo MLE Results:
## Estimate Std. Error MCMC % z value Pr(>|z|)
## nodeifactor.college.bound.1 -0.440428 0.234466 0 -1.878 0.0603 .
## nodeofactor.college.bound.1 -0.128489 0.239991 0 -0.535 0.5924
## nodeifactor.likingschool.yes, a bit -0.119945 0.258462 0 -0.464 0.6426
## nodeifactor.likingschool.yes, very 0.326900 0.358161 0 0.913 0.3614
## nodeofactor.likingschool.yes, a bit 0.245795 0.282632 0 0.870 0.3845
## nodeofactor.likingschool.yes, very 0.743668 0.360332 0 2.064 0.0390 *
## nodeicov.grade -0.463063 0.276297 0 -1.676 0.0937 .
## nodeocov.grade 0.251511 0.298803 0 0.842 0.3999
## nodeocov.stress 0.006816 0.091871 0 0.074 0.9409
## nodeicov.stress -0.074563 0.092885 0 -0.803 0.4221
## nodeocov.risk -0.175969 0.134702 0 -1.306 0.1914
## nodeicov.risk 0.101284 0.139378 0 0.727 0.4674
## nodeocov.tvhours -0.014148 0.034850 0 -0.406 0.6848
## nodeicov.tvhours -0.002643 0.034837 0 -0.076 0.9395
## mutual 3.291454 0.315280 0 10.440 <1e-04 ***
## edges -1.733572 1.597927 0 -1.085 0.2780
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Null Deviance: 2503.6 on 1806 degrees of freedom
## Residual Deviance: 827.1 on 1790 degrees of freedom
##
## AIC: 859.1 BIC: 947.1 (Smaller is better.)
According to these results, neither stress level, risky behaviour, nor hours spent watching television affects incoming or outgoing tie formation.
nodematch
So far we have covered nodefactor, which lets us include categorical variables, and nodecov, which lets us include continuous variables. We are now going to use nodematch, which helps us measure whether students have a tendency to nominate friends with whom they share a particular attribute. In other words, it measures homophily in a network based on a categorical variable.
m11 <- as.formula(nw ~
                    nodematch('class') + # Match between students of the same class
                    nodefactor('class') + # But also, does class have an effect in the nomination?
                    mutual +
                    edges)
fit11 <- ergm(m11)
summary(fit11)
##
## ==========================
## Summary of model fit
## ==========================
##
## Formula: nw ~ nodematch("class") + nodefactor("class") + mutual + edges
##
## Iterations: 2 out of 20
##
## Monte Carlo MLE Results:
## Estimate Std. Error MCMC % z value Pr(>|z|)
## nodematch.class 2.21332 0.25308 0 8.745 <1e-04 ***
## nodefactor.class.15gD 0.06452 0.09586 0 0.673 0.501
## nodefactor.class.15gE 0.04342 0.11380 0 0.382 0.703
## mutual 2.38153 0.32212 0 7.393 <1e-04 ***
## edges -4.39971 0.25514 0 -17.244 <1e-04 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Null Deviance: 2503.6 on 1806 degrees of freedom
## Residual Deviance: 714.1 on 1801 degrees of freedom
##
## AIC: 724.1 BIC: 751.6 (Smaller is better.)
These results (unsurprisingly) suggest that in this network there is a strong, statistically significant tendency to nominate someone who is in the same class as you.
Note: the nodematch() term should always be used while also including the nodefactor() term for the same variable. The reason for this is to control for skewed distributions in your factor variable. As a simple example, consider a network where 90% of the nodes are blue and 10% are red. Even in a random network, you would expect there to be more ties between blue nodes than between blue and red nodes (see the quick count below). Adding the nodefactor() term helps control for this overrepresentation of possible ties between nodes that share an attribute.
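A quick back-of-the-envelope count makes the intuition concrete (the 90/10 split and the network size are made-up numbers for illustration):
# Hypothetical directed network: 100 nodes, 90 blue and 10 red
n_blue <- 90
n_red  <- 10
blue_blue <- n_blue * (n_blue - 1) # 8010 ordered blue-blue dyads
red_red   <- n_red * (n_red - 1)   # 90 ordered red-red dyads
mixed     <- 2 * n_blue * n_red    # 1800 ordered mixed dyads
blue_blue / (blue_blue + red_red + mixed)
# about 0.81: even in a purely random network, roughly 81% of all
# possible ties connect two blue nodes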
By including diff = TRUE in the nodematch() term you can look at the homophily pattern for each unique value of your categorical variable (m12). It is also possible to exclude certain values by using the keep = XX option (m12b).
m12 <- as.formula(nw ~
                    nodematch('class', diff = TRUE) + # Making the difference between classes explicit
                    nodefactor('class') + # But also, does class have an effect in the nomination?
                    mutual +
                    edges)
fit12 <- ergm(m12)
summary(fit12)
##
## ==========================
## Summary of model fit
## ==========================
##
## Formula: nw ~ nodematch("class", diff = TRUE) + nodefactor("class") +
## mutual + edges
##
## Iterations: 2 out of 20
##
## Monte Carlo MLE Results:
## Estimate Std. Error MCMC % z value Pr(>|z|)
## nodematch.class.15gC 3.5329 0.7679 0 4.601 <1e-04 ***
## nodematch.class.15gD 1.5258 0.7864 0 1.940 0.0524 .
## nodematch.class.15gE 1.8975 0.7994 0 2.374 0.0176 *
## nodefactor.class.15gD 1.0423 0.6217 0 1.676 0.0937 .
## nodefactor.class.15gE 0.8291 0.5263 0 1.575 0.1152
## mutual 2.3717 0.3189 0 7.438 <1e-04 ***
## edges -5.6807 0.7485 0 -7.590 <1e-04 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Null Deviance: 2503.6 on 1806 degrees of freedom
## Residual Deviance: 709.8 on 1799 degrees of freedom
##
## AIC: 723.8 BIC: 762.3 (Smaller is better.)
m12b <- as.formula(nw ~
                     nodematch('class', diff = TRUE, keep = 1) + # Keeping the first class in the estimation
                     nodefactor('class') + # But controlling for the class distribution
                     mutual +
                     edges)
fit12b <- ergm(m12b)
summary(fit12b)
##
## ==========================
## Summary of model fit
## ==========================
##
## Formula: nw ~ nodematch("class", diff = TRUE, keep = 1) + nodefactor("class") +
## mutual + edges
##
## Iterations: 2 out of 20
##
## Monte Carlo MLE Results:
## Estimate Std. Error MCMC % z value Pr(>|z|)
## nodematch.class.15gC 4.5536 0.7192 0 6.331 <1e-04 ***
## nodefactor.class.15gD 2.1574 0.3650 0 5.911 <1e-04 ***
## nodefactor.class.15gE 1.9254 0.3715 0 5.182 <1e-04 ***
## mutual 2.6597 0.3151 0 8.440 <1e-04 ***
## edges -6.8082 0.6870 0 -9.910 <1e-04 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Null Deviance: 2503.6 on 1806 degrees of freedom
## Residual Deviance: 755.6 on 1801 degrees of freedom
##
## AIC: 765.6 BIC: 793.1 (Smaller is better.)
We can extend this same analysis to different variables to see how they affect the homophily in the network:
# Notice how we keep the 'match' and 'factor' terms for each variable on the same row to keep track of them
m13 <- as.formula(nw ~
                    nodematch('class') + nodefactor('class') +
                    nodematch('sex') + nodefactor('sex') +
                    nodematch('college.bound') + nodeifactor('college.bound') +
                    nodematch('body.image') + nodeifactor('body.image') + nodeofactor('body.image') +
                    nodematch('gaming') + nodeifactor('gaming') + nodeofactor('gaming') +
                    nodematch('likingschool', diff = TRUE) + nodeofactor('likingschool') +
                    mutual +
                    edges)
fit13 <- ergm(m13, control = control.ergm(seed = 12345))
summary(fit13)
##
## ==========================
## Summary of model fit
## ==========================
##
## Formula: nw ~ nodematch("class") + nodefactor("class") + nodematch("sex") +
## nodefactor("sex") + nodematch("college.bound") + nodeifactor("college.bound") +
## nodematch("body.image") + nodeifactor("body.image") + nodeofactor("body.image") +
## nodematch("gaming") + nodeifactor("gaming") + nodeofactor("gaming") +
## nodematch("likingschool", diff = TRUE) + nodeofactor("likingschool") +
## mutual + edges
##
## Iterations: 2 out of 20
##
## Monte Carlo MLE Results:
## Estimate Std. Error MCMC % z value Pr(>|z|)
## nodematch.class 2.4306 0.2737 0 8.879 < 1e-04 ***
## nodefactor.class.15gD 0.1405 0.1149 0 1.222 0.22157
## nodefactor.class.15gE 0.2761 0.1365 0 2.023 0.04312 *
## nodematch.sex 1.2343 0.2263 0 5.453 < 1e-04 ***
## nodefactor.sex.male -0.2751 0.1612 0 -1.707 0.08776 .
## nodematch.college.bound 0.1665 0.1880 0 0.886 0.37560
## nodeifactor.college.bound.1 -0.5108 0.2280 0 -2.240 0.02507 *
## nodematch.body.image 0.1405 0.1886 0 0.745 0.45622
## nodeifactor.body.image.1 -0.2719 0.2329 0 -1.168 0.24294
## nodeofactor.body.image.1 0.2848 0.2284 0 1.247 0.21246
## nodematch.gaming 0.2569 0.1987 0 1.293 0.19615
## nodeifactor.gaming.1 0.2611 0.2547 0 1.025 0.30522
## nodeofactor.gaming.1 0.5649 0.2587 0 2.183 0.02900 *
## nodematch.likingschool.no -0.6784 0.7341 0 -0.924 0.35543
## nodematch.likingschool.yes, a bit -0.4467 0.2731 0 -1.636 0.10184
## nodematch.likingschool.yes, very -0.1141 0.9655 0 -0.118 0.90595
## nodeofactor.likingschool.yes, a bit 0.5956 0.3785 0 1.573 0.11563
## nodeofactor.likingschool.yes, very 1.1817 0.3913 0 3.020 0.00253 **
## mutual 2.0857 0.3424 0 6.092 < 1e-04 ***
## edges -5.7918 0.4971 0 -11.652 < 1e-04 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Null Deviance: 2503.6 on 1806 degrees of freedom
## Residual Deviance: 638.2 on 1786 degrees of freedom
##
## AIC: 678.2 BIC: 788.2 (Smaller is better.)
Being in the same class increases the probability of a connection being made between two nodes, just as we saw before. However, we can also claim that gender plays an important role in tie formation for the students in this high school: the nodematch term for gender is positive and strong, indicating a strong tendency for friendships to form between students of the same gender.
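On the odds scale (computed from the summary above):
# Odds multiplier for a tie between students of the same gender
exp(coef(fit13)['nodematch.sex'])
# about exp(1.23) = 3.4: in odds terms, same-gender ties are over
# three times as likely as cross-gender ties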
nodemix
We can explore categorical variables in a lot more detail using the nodemix term. This term is used to determine how likely it is that a tie forms between nodes exhibiting one value of a categorical variable and nodes exhibiting a different value of the same variable. Let's use the case of likingschool as an example. In the model below, we have the following term:
nodemix('likingschool', base = c(1))
This means that we are looking at all the different combinations of likingschool, considering the first one in the list to be the base against which all the other ones are compared. The best way to see in which order they occur is to first run the model without the base = option. With the full list of combinations it is easier to determine which one we would like to consider the base. The entire list of values is shown below.
unique(get.node.attr(nw = nw, attrname = "likingschool"))
## [1] "no" "yes, a bit" "yes, very"
From it we can say that no-no is the first combination, which we take as the base in m15 with the nodemix term.
m15 <- as.formula(nw ~
                    nodematch('class') + nodefactor('class') +
                    nodematch('sex') + nodefactor('sex') +
                    nodeifactor('college.bound') +
                    nodeofactor('gaming') +
                    nodemix('likingschool', base = c(1)) +
                    mutual +
                    edges)
fit15 <- ergm(m15)
summary(fit15)
##
## ==========================
## Summary of model fit
## ==========================
##
## Formula: nw ~ nodematch("class") + nodefactor("class") + nodematch("sex") +
## nodefactor("sex") + nodeifactor("college.bound") + nodeofactor("gaming") +
## nodemix("likingschool", base = c(1)) + mutual + edges
##
## Iterations: 2 out of 20
##
## Monte Carlo MLE Results:
## Estimate Std. Error MCMC % z value Pr(>|z|)
## nodematch.class 2.4259 0.2682 0 9.044 <1e-04 ***
## nodefactor.class.15gD 0.1660 0.1157 0 1.435 0.1514
## nodefactor.class.15gE 0.3034 0.1339 0 2.265 0.0235 *
## nodematch.sex 1.3377 0.2128 0 6.286 <1e-04 ***
## nodefactor.sex.male -0.2109 0.1305 0 -1.616 0.1060
## nodeifactor.college.bound.1 -0.5442 0.2372 0 -2.295 0.0218 *
## nodeofactor.gaming.1 0.5808 0.2530 0 2.296 0.0217 *
## mix.likingschool.yes, a bit.no 1.0322 0.7380 0 1.399 0.1619
## mix.likingschool.yes, very.no 1.7956 0.8709 0 2.062 0.0392 *
## mix.likingschool.no.yes, a bit 0.6565 0.7479 0 0.878 0.3800
## mix.likingschool.yes, a bit.yes, a bit 0.8272 0.7007 0 1.180 0.2378
## mix.likingschool.yes, very.yes, a bit 1.6970 0.7639 0 2.221 0.0263 *
## mix.likingschool.no.yes, very 0.9598 0.9359 0 1.026 0.3051
## mix.likingschool.yes, a bit.yes, very 1.5577 0.7761 0 2.007 0.0447 *
## mix.likingschool.yes, very.yes, very 1.8176 1.2233 0 1.486 0.1373
## mutual 2.0302 0.3443 0 5.896 <1e-04 ***
## edges -6.1589 0.7880 0 -7.816 <1e-04 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Null Deviance: 2503.6 on 1806 degrees of freedom
## Residual Deviance: 643.1 on 1789 degrees of freedom
##
## AIC: 677.1 BIC: 770.6 (Smaller is better.)
You can see that students who enjoy school very much have a tendency to make friends with other students, regardless of how much those other students enjoy school. This result corroborates what we found when using the nodeofactor('likingschool') term before: students who love school are a bit more social than those who do not.
absdiff
The term we are going to describe in this next section is in a category of its own: that of dyadic covariates. This term deals with absolute-difference effects. The intuition behind it is similar to nodematch(), but for continuous variables: it explores whether two nodes form a tie because they have similar values (i.e., a smaller difference) or because they do not (i.e., a larger difference). Let's use the following model as an example:
m16 <- as.formula(nw ~
                    nodematch('class') + nodefactor('class') +
                    nodematch('sex') + nodefactor('sex') +
                    nodeifactor('college.bound') +
                    nodeofactor('gaming') +
                    nodeofactor('likingschool') +
                    absdiff('grade') +
                    absdiff('drink') +
                    absdiff('life.sat') +
                    absdiff('risk') +
                    absdiff('stress') +
                    absdiff('sport') +
                    absdiff('relig') +
                    absdiff('tvhours') +
                    mutual +
                    edges)
fit16 <- ergm(m16)
summary(fit16)
##
## ==========================
## Summary of model fit
## ==========================
##
## Formula: nw ~ nodematch("class") + nodefactor("class") + nodematch("sex") +
## nodefactor("sex") + nodeifactor("college.bound") + nodeofactor("gaming") +
## nodeofactor("likingschool") + absdiff("grade") + absdiff("drink") +
## absdiff("life.sat") + absdiff("risk") + absdiff("stress") +
## absdiff("sport") + absdiff("relig") + absdiff("tvhours") +
## mutual + edges
##
## Iterations: 2 out of 20
##
## Monte Carlo MLE Results:
## Estimate Std. Error MCMC % z value Pr(>|z|)
## nodematch.class 2.468449 0.271065 0 9.106 < 1e-04 ***
## nodefactor.class.15gD 0.180488 0.121959 0 1.480 0.138899
## nodefactor.class.15gE 0.305658 0.141335 0 2.163 0.030569 *
## nodematch.sex 1.381203 0.213071 0 6.482 < 1e-04 ***
## nodefactor.sex.male -0.198510 0.139235 0 -1.426 0.153949
## nodeifactor.college.bound.1 -0.345249 0.222075 0 -1.555 0.120030
## nodeofactor.gaming.1 0.563179 0.254250 0 2.215 0.026756 *
## nodeofactor.likingschool.yes, a bit 0.563662 0.298514 0 1.888 0.058996 .
## nodeofactor.likingschool.yes, very 1.382289 0.383753 0 3.602 0.000316 ***
## absdiff.grade -0.234529 0.261327 0 -0.897 0.369476
## absdiff.drink -0.007827 0.016389 0 -0.478 0.632937
## absdiff.life.sat 0.041134 0.165031 0 0.249 0.803167
## absdiff.risk -0.458684 0.146653 0 -3.128 0.001762 **
## absdiff.stress 0.077243 0.100320 0 0.770 0.441319
## absdiff.sport 0.038941 0.046349 0 0.840 0.400810
## absdiff.relig -0.013195 0.060848 0 -0.217 0.828320
## absdiff.tvhours -0.074884 0.040516 0 -1.848 0.064567 .
## mutual 1.991941 0.339958 0 5.859 < 1e-04 ***
## edges -5.329670 0.574304 0 -9.280 < 1e-04 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Null Deviance: 2503.6 on 1806 degrees of freedom
## Residual Deviance: 632.1 on 1787 degrees of freedom
##
## AIC: 670.1 BIC: 774.6 (Smaller is better.)
absdiff.risk is the only one of these terms that turns out to be statistically significant. We can interpret the estimate as follows: students have a tendency to form ties with other students who report similar risk behaviour, with each unit of absolute difference in the risk score multiplying the odds of a tie by exp(-0.46) = 0.63. The larger the difference in risk behaviour, the less likely the friendship.
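The multiplier can be computed directly:
# Odds multiplier per unit of absolute difference in risk scores
exp(coef(fit16)['absdiff.risk'])
# about 0.63: each unit of difference shrinks the odds by roughly a third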
We have explained the different kinds of terms that an ERG model can consider during the estimation process. This is only an introduction. There are more than the ones we went through here. In the case of endogenous terms, there is also the possibility of defining your own terms, but we are not going to get into doing so during this tutorial.
Instead, we are going to look at the model as a whole (rather than the individual parts we have discussed). We will use the information from the different steps we took to get here: what is statistically significant and what is not. Model m17 considers both endogenous terms and node covariates:
m17 <- as.formula(nw ~
                    gwodegree(1, fixed = TRUE) +
                    gwidegree(1, fixed = TRUE) +
                    gwesp(.5, fixed = TRUE) +
                    nodematch('class') + nodefactor('class') +
                    nodematch('sex') + nodeifactor('sex') +
                    nodeicov('grade') +
                    nodeofactor('gaming') +
                    nodeofactor('likingschool') +
                    absdiff('risk') +
                    absdiff('tvhours') +
                    mutual +
                    edges)
fit17 <- ergm(m17, control = control.ergm(seed = 12345))
In this case, we are going to use the screenreg() function instead of the usual summary so that we can start thinking about exporting our results to a TeX file. We can set the different models we have been developing next to each other to see how the coefficients compare:
screenreg(list(fit3a, fit16, fit17))
##
## ==========================================================================
## Model 1 Model 2 Model 3
## --------------------------------------------------------------------------
## gwodeg.fixed.0.5 -1.24 **
## (0.46)
## gwideg.fixed.0.5 -0.94
## (0.48)
## edges -2.16 *** -5.33 *** -4.20 **
## (0.14) (0.57) (1.31)
## nodematch.class 2.47 *** 1.58 ***
## (0.27) (0.26)
## nodefactor.class.15gD 0.18 0.06
## (0.12) (0.08)
## nodefactor.class.15gE 0.31 * 0.28 **
## (0.14) (0.08)
## nodematch.sex 1.38 *** 1.02 ***
## (0.21) (0.17)
## nodefactor.sex.male -0.20
## (0.14)
## nodeifactor.college.bound.1 -0.35
## (0.22)
## nodeofactor.gaming.1 0.56 * 0.32
## (0.25) (0.21)
## nodeofactor.likingschool.yes, a bit 0.56 0.44
## (0.30) (0.25)
## nodeofactor.likingschool.yes, very 1.38 *** 1.13 **
## (0.38) (0.37)
## absdiff.grade -0.23
## (0.26)
## absdiff.drink -0.01
## (0.02)
## absdiff.life.sat 0.04
## (0.17)
## absdiff.risk -0.46 ** -0.36 **
## (0.15) (0.13)
## absdiff.stress 0.08
## (0.10)
## absdiff.sport 0.04
## (0.05)
## absdiff.relig -0.01
## (0.06)
## absdiff.tvhours -0.07 -0.07 *
## (0.04) (0.03)
## mutual 1.99 *** 1.29 ***
## (0.34) (0.38)
## gwodeg.fixed.1 0.73
## (0.56)
## gwideg.fixed.1 0.63
## (0.55)
## gwesp.fixed.0.5 0.96 ***
## (0.18)
## nodeifactor.sex.male -0.24
## (0.17)
## nodeicov.grade -0.34
## (0.25)
## --------------------------------------------------------------------------
## AIC 951.83 670.14 627.00
## BIC 968.32 774.62 714.98
## Log Likelihood -472.91 -316.07 -297.50
## ==========================================================================
## *** p < 0.001, ** p < 0.01, * p < 0.05
Now, suppose we would like to export that table to a paper. We can use the texreg()
function and its many options for this. What follows is only a small example; other options include adding custom names for your coefficients, notes, the number of digits, etc. Note: since this is an html document and not a pdf, we are using htmlreg()
, but the two functions work in exactly the same way.
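For a LaTeX paper, the equivalent call would look like the following sketch (the file name is our own choice):
texreg(list(fit3a, fit16, fit17),
        file = 'ergm_models.tex', # writes the LaTeX table to this file
        custom.model.names = c('endogenous', 'covariates', 'combo'),
        single.row = TRUE,
        caption = 'Comparison of ERG models',
        fontsize = 'footnotesize')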
htmlreg(list(fit3a, fit16, fit17),
custom.model.names = c('endogenous', 'covariates', 'combo'),
single.row = TRUE, # adds SE on the same line as coefficients
caption = 'Comparison of ERG models',
fontsize = 'footnotesize')
| | endogenous | covariates | combo |
|---|---|---|---|
| gwodeg.fixed.0.5 | -1.24 (0.46)** | | |
| gwideg.fixed.0.5 | -0.94 (0.48) | | |
| edges | -2.16 (0.14)*** | -5.33 (0.57)*** | -4.20 (1.31)** |
| nodematch.class | | 2.47 (0.27)*** | 1.58 (0.26)*** |
| nodefactor.class.15gD | | 0.18 (0.12) | 0.06 (0.08) |
| nodefactor.class.15gE | | 0.31 (0.14)* | 0.28 (0.08)** |
| nodematch.sex | | 1.38 (0.21)*** | 1.02 (0.17)*** |
| nodefactor.sex.male | | -0.20 (0.14) | |
| nodeifactor.college.bound.1 | | -0.35 (0.22) | |
| nodeofactor.gaming.1 | | 0.56 (0.25)* | 0.32 (0.21) |
| nodeofactor.likingschool.yes, a bit | | 0.56 (0.30) | 0.44 (0.25) |
| nodeofactor.likingschool.yes, very | | 1.38 (0.38)*** | 1.13 (0.37)** |
| absdiff.grade | | -0.23 (0.26) | |
| absdiff.drink | | -0.01 (0.02) | |
| absdiff.life.sat | | 0.04 (0.17) | |
| absdiff.risk | | -0.46 (0.15)** | -0.36 (0.13)** |
| absdiff.stress | | 0.08 (0.10) | |
| absdiff.sport | | 0.04 (0.05) | |
| absdiff.relig | | -0.01 (0.06) | |
| absdiff.tvhours | | -0.07 (0.04) | -0.07 (0.03)* |
| mutual | | 1.99 (0.34)*** | 1.29 (0.38)*** |
| gwodeg.fixed.1 | | | 0.73 (0.56) |
| gwideg.fixed.1 | | | 0.63 (0.55) |
| gwesp.fixed.0.5 | | | 0.96 (0.18)*** |
| nodeifactor.sex.male | | | -0.24 (0.17) |
| nodeicov.grade | | | -0.34 (0.25) |
| AIC | 951.83 | 670.14 | 627.00 |
| BIC | 968.32 | 774.62 | 714.98 |
| Log Likelihood | -472.91 | -316.07 | -297.50 |

***p < 0.001, **p < 0.01, *p < 0.05
We are now going to look into marginal effects, following the same intuition these have in traditional logistic models. The motivation is to determine how individual variables or interaction effects relate to the probability of two nodes forming a tie.
edgeprob()
Traditional regression analysis often uses marginal effects to check how a continuous variable affects the dependent variable, not just at that variable's mean but across its full range. ERG models can do the same using the command edgeprob()
, which is part of the xergm
package.
This function calculates the probability of forming a tie for every dyad in the network. After calculating all the probabilities, you can plot them against any variable you have included in your ERGM.
Let us consider a simple example using m17
, one of the three models we compared above. We would like to know more about the relationship between similarity in risky behaviour and the probability of a link forming between two nodes. We do this by running:
alldyads <- edgeprob(fit17)
This command creates a new data frame with all dyads in an edge list, together with the change statistics of every term in the model.
allnames <- matrix(data = names(alldyads), ncol = 1)
colnames(allnames) <- "Variable name"
kable_styling(kable(allnames), bootstrap_options = c("condensed"), full_width = FALSE)
| Variable name |
|---|
| tie |
| gwodegree |
| gwidegree |
| gwesp.fixed.0.5 |
| nodematch.class |
| nodefactor.class.15gD |
| nodefactor.class.15gE |
| nodematch.sex |
| nodeifactor.sex.male |
| nodeicov.grade |
| nodeofactor.gaming.1 |
| nodeofactor.likingschool.yes, a bit |
| nodeofactor.likingschool.yes, very |
| absdiff.risk |
| absdiff.tvhours |
| mutual |
| edges |
| i |
| j |
| t |
| i.name |
| j.name |
| probability |
names()
shows the names of all the variables in this data frame.
The first one is the dependent variable: a 0/1 dummy that measures whether two students are friends or not. The column labeled \(i\) represents the sender in the edgelist, and the column \(j\) the receiver. The column probability
gives the predicted probability of two students being friends.
head(alldyads[, c("tie", "i", "j", "probability")])
## tie i j probability
## 1 0 1 2 0.284690363
## 2 0 1 3 0.008728328
## 3 0 1 4 0.011619183
## 4 0 1 5 0.005896995
## 5 0 1 6 0.006058462
## 6 0 1 7 0.007546676
A better way of showing this information is to plot the part we are interested in. We can use ggplot()
for this.
ggplot(alldyads,
aes(x = absdiff.risk,
y = probability)) +
geom_point() +
geom_smooth(method = 'lm')
On the plot you can see the absolute difference in risky behaviour on the \(x\)-axis, and the probability of a tie on the \(y\)-axis.
The graph shows the marginal effects: you can see the negative correlation between dissimilarity in risk-related behaviour and the probability of a tie being formed.
We can explore this even further by analysing the differential effect in a group of interest. In this case, let us look at how whether two students are in the same class affects this probability. The correlation becomes more noticeable when students are in the same class; when they are not, the probability of a connection is so low that the difference in risky behaviour does not seem to matter. A possible explanation is that when two students are in the same class, there is more social awareness of the implications of spending time, and becoming friends, with a person who perceives risk differently.
ggplot(alldyads,
aes(x = absdiff.risk,
y = probability,
color = factor(nodematch.class))) +
geom_point() +
geom_smooth(method = 'lm')
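As a quick numeric check of this pattern, we can compare the average predicted probability for same-class and different-class dyads (a small sketch using the alldyads data frame created above):
# mean predicted tie probability: different class (0) vs. same class (1)
aggregate(probability ~ nodematch.class, data = alldyads, FUN = mean)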
interpret()
The interpret()
function allows us to look at the individual probability of a link between nodes, or the probability of a particular tie that we are interested in. This is usually referred to as micro-level interpretation. The two functionalities we are going to explore are "node" and "tie". The required parameters of the function are:
- object is either an ergm, a btergm, or an mtergm object.
- type is either "tie" or "node".
- i and j each have to be a single number representing the nodes we are interested in. For type = "node", there can be only one i, and more than one j.

We should note that looking at ties is interesting only for directed networks.
Let us look at an example for clarity. We are going to use the model resulting from m17
.
# Probability of a tie between node 1 and node 4
interpret(object = fit17, type = "tie", i = 1, j = 4)
## [1] 0.01161918
The probability of there being a link between nodes 1 and 4 is around 0.012.
Now we are going to look at the possible combinations of links between node 1 and nodes 5 through 8. Each row of the results table gives the probability of one particular combination of ties with node 1 as the sender and the others as receivers (1 = tie present, 0 = tie absent).
# Probability of a tie between node 1 and nodes 5 through 8
interpret(object = fit17, type = "node", i = 1, j = 5:8)
## probability Receiver 5 Receiver 6 Receiver 7 Receiver 8
## Sender 1 9.749668e-01 0 0 0 0
## Sender 1 7.427389e-09 1 1 1 1
## Sender 1 5.783480e-03 1 0 0 0
## Sender 1 5.942804e-03 0 1 0 0
## Sender 1 7.413707e-03 0 0 1 0
## Sender 1 5.449108e-03 0 0 0 1
## Sender 1 3.294183e-05 1 1 0 0
## Sender 1 4.109527e-05 1 0 1 0
## Sender 1 3.020521e-05 1 0 0 1
## Sender 1 4.222736e-05 0 1 1 0
## Sender 1 3.103730e-05 0 1 0 1
## Sender 1 2.633352e-04 0 0 1 1
## Sender 1 2.242533e-07 1 1 1 0
## Sender 1 1.648272e-07 1 1 0 1
## Sender 1 1.398472e-06 1 0 1 1
## Sender 1 1.436997e-06 0 1 1 1
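The first row is the probability that node 1 sends no tie to any of the four receivers, so its complement gives the probability of at least one outgoing tie. A minimal sketch, assuming the returned table can be indexed like the printout above:
node.probs <- interpret(object = fit17, type = "node", i = 1, j = 5:8)
# first row = probability of no ties from node 1 to receivers 5 through 8
1 - node.probs[1, 1] # probability of at least one tie, around 0.025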
Running ?interpret
provides a more thorough explanation of the function and a couple of additional examples.
Finally, after running the model, studying the coefficients, and extracting as much information as possible from the marginal plots, we need to assess the goodness-of-fit of the model. This step is a must: without it you cannot determine whether your model is behaving as expected. Always check the goodness-of-fit, and make sure it is included in any report written about the model.
The xergm
package (which we will look into in more detail later on) is very useful for this particular purpose. The command is very simple, but the user can add more complexity by including more terms to evaluate the fit.
Alternatively, you can use btergm::gof()
for a more comprehensive goodness-of-fit analysis.
gof.model17 <- btergm::gof(fit17, nsim = 50)
Running the function produces tables which can be plotted later. The full output of the previous chunk is hidden, but as a short example, this is what it looks like:
gof.model17$Degree$stats
## obs sim: mean median min max Pr(>z)
## 0 0 1.56 1.0 0 6 0.54035044
## 1 2 3.72 3.5 1 8 0.49962404
## 2 5 5.38 5.5 1 10 0.88143851
## 3 11 6.28 6.0 2 13 0.06394703
## 4 3 6.10 5.5 2 13 0.22371209
## 5 8 6.56 6.5 1 13 0.57194831
## 6 4 5.10 5.0 1 10 0.66593176
## 7 4 3.68 4.0 0 7 0.90005130
## 8 4 2.60 2.0 0 5 0.58267274
## 9 1 1.28 1.0 0 5 0.91249108
## 10 1 0.44 0.0 0 2 0.82603163
## 11 0 0.12 0.0 0 1 0.96243456
## 12 0 0.14 0.0 0 1 0.95617950
## 13 0 0.00 0.0 0 0 1.00000000
## 14 0 0.02 0.0 0 1 0.99373684
## 15 0 0.00 0.0 0 0 1.00000000
## 16 0 0.02 0.0 0 1 0.99373684
## 17 0 0.00 0.0 0 0 1.00000000
#pdf(file = 'gof_model17.pdf')
plot(gof.model17)
#dev.off()
The goodness-of-fit procedure simulates a large number of networks (the number is set via nsim =
) using the estimated coefficients. In every one of the graphs, the bold line in the frequency plot represents the values from the original network (i.e., your data), while the boxplots show the values from the simulated networks. Ideally, the mean of the simulated networks should overlap with the observed value. If it does not, you must return to your model and check which kinds of endogenous network terms your theory of the world might be missing.
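To screen these tables programmatically, one option (a sketch, assuming the stats matrix keeps the column names printed above) is to flag statistics whose observed value falls outside the whole range of simulated values:
deg <- gof.model17$Degree$stats
# degree bins where the observed count lies outside the simulated min-max range
deg[deg[, "obs"] < deg[, "min"] | deg[, "obs"] > deg[, "max"], ]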
Some other diagnostics:
Receiver-operating characteristic (ROC) curve: this diagnostic checks the predictive power of your model, i.e., whether the dyad states in your observed network are identical (or very similar) to those in your simulated networks. The curve plots the relative frequency of true-positive predictions against that of false-positive predictions. If the curve lies close to the diagonal, the true-positive and false-positive rates are equal, meaning that the simulated process resembles a random one. The ROC curve should therefore lie above the diagonal, and the higher the better: it indicates more true-positive than false-positive predictions. The Wikipedia page contains useful information on the ROC. It should only be used if your density is high.
Precision-recall curve: similar to the ROC, but better suited to sparse networks. Recall answers the question "how many of the actual ties do we recognize as ties?", while precision (the positive predictive value, PPV) answers "do we only predict the 1s as 1s, or do we also predict some 0s as 1s?" (see the short sketch after this list).
Modularity distribution: modularity measures how fragmented the network is into groups. High values of modularity indicate densely connected nodes within a specific group, with only a few ties to other groups.
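To make precision and recall concrete, here is a toy computation on the dyad-level predictions from edgeprob() above (our own illustration with an arbitrary 0.5 cutoff; the rocpr statistic in the gof call below evaluates this across all cutoffs):
pred <- alldyads$probability > 0.5       # dyads predicted as ties at a 0.5 cutoff
tp <- sum(pred & alldyads$tie == 1)      # true positives
c(precision = tp / sum(pred),            # of the predicted ties, how many are real?
  recall = tp / sum(alldyads$tie == 1))  # of the real ties, how many do we find?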
In the gof function of the btergm
package you can specify a number of other statistics that should be examined for goodness-of-fit. Type ?'gof-terms'
for the entire list.
# A shorter list of diagnostics than the default:
gof.model17b <- btergm::gof(fit17,
                            nsim = 50,
                            statistics = c(dsp, ideg, odeg,
                                           triad.directed, istar, rocpr))
gof.model17b
plot(gof.model17b)
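The tables below are printed in the order of the statistics argument: dyad-wise shared partners, in-degree, out-degree, the directed triad census, in-stars, and finally the ROC/PR summary.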
##
## obs sim: mean median min max Pr(>z)
## 0 1533 1490.82 1485.0 1384 1615 0.9349
## 1 183 218.96 212.5 150 306 0.9445
## 2 64 71.06 75.0 29 102 0.9891
## 3 22 20.04 19.5 1 43 0.9970
## 4 4 4.44 3.0 0 13 0.9993
## 5 0 0.56 0.0 0 3 0.9991
## 6 0 0.12 0.0 0 2 0.9998
## 7 0 0.00 0.0 0 0 1.0000
##
## obs sim: mean median min max Pr(>z)
## 0 3 3.78 4 1 7 0.8077
## 1 8 6.74 7 2 13 0.6943
## 2 8 7.70 7 3 18 0.9254
## 3 9 8.48 8 4 14 0.8711
## 4 4 6.50 6 2 11 0.4354
## 5 4 4.88 4 0 11 0.7837
## 6 3 2.42 2 0 7 0.8564
## 7 3 1.50 1 0 5 0.6398
## 8 1 0.60 0 0 3 0.9007
## 9 0 0.26 0 0 2 0.9354
## 10 0 0.12 0 0 1 0.9701
## 11 0 0.02 0 0 1 0.9950
## 12 0 0.00 0 0 0 1.0000
##
## obs sim: mean median min max Pr(>z)
## 0 4 3.80 3.0 0 8 0.9495
## 1 8 7.02 7.0 3 11 0.7564
## 2 7 8.14 8.0 3 16 0.7182
## 3 7 7.52 7.5 3 13 0.8693
## 4 5 6.64 6.0 2 14 0.6037
## 5 6 4.66 5.0 0 9 0.6715
## 6 1 2.70 3.0 0 7 0.5905
## 7 5 1.36 1.0 0 7 0.2492
## 8 0 0.64 0.0 0 3 0.8395
## 9 0 0.30 0.0 0 1 0.9243
## 10 0 0.12 0.0 0 1 0.9697
## 11 0 0.10 0.0 0 1 0.9747
## 12 0 0.00 0.0 0 0 1.0000
##
## obs sim: mean median min max Pr(>z)
## 003 8643 8705.18 8675.5 8202 9511 0.9774
## 012 2213 2193.94 2225.5 1771 2652 0.9931
## 102 1154 1084.64 1086.0 654 1506 0.9748
## 021D 45 37.00 37.0 19 64 0.9971
## 021U 41 36.32 36.0 21 67 0.9983
## 021C 42 75.82 71.0 43 115 0.9877
## 111D 61 61.42 63.5 30 95 0.9998
## 111U 54 64.04 66.0 30 94 0.9964
## 030T 16 8.82 8.0 0 23 0.9974
## 030C 0 2.40 2.0 0 6 0.9991
## 201 17 19.98 19.0 3 38 0.9989
## 120D 9 6.68 7.0 1 12 0.9992
## 120U 15 7.80 8.0 2 13 0.9974
## 120C 4 8.86 9.0 1 19 0.9982
## 210 18 19.46 19.0 6 34 0.9995
## 300 0 0.00 0.0 0 0 1.0000
##
## obs sim: mean median min max Pr(>z)
## 0 0 0.00 0.0 0 0 1.0000
## 1 134 131.56 134.0 96 154 0.9804
## 2 235 221.40 232.5 101 326 0.8913
## 3 286 257.32 255.0 67 532 0.7732
## 4 244 226.26 197.5 29 709 0.8585
## 5 141 156.74 117.5 8 782 0.8743
## 6 52 86.00 43.5 1 686 0.7326
## 7 11 36.66 10.0 0 451 0.7966
## 8 1 11.64 1.0 0 210 0.9149
## 9 0 2.56 0.0 0 65 0.9795
## 10 0 0.34 0.0 0 12 0.9973
## 11 0 0.02 0.0 0 1 0.9998
## 12 0 0.00 0.0 0 0 1.0000
##
## ROC model ROC random PR model PR random
## 1 0.8589944 0.5423067 0.4153467 0.08020055
As mentioned above, degeneracy is one of the main problems researchers encounter when dealing with ERG models. A useful definition of this concept is given here, stating that degeneracy occurs when the model places disproportionate mass on only a few of the possible graph configurations. This means that the methods used to reach convergence do not work as intended and the resulting simulations are not adequate examples of the observed graph. Some research has been done to understand what makes some models experience degeneracy more than others (see here or here).
To assess degeneracy in one of our models we can use the mcmc.diagnostics()
function. All values from your sampling procedure should form a bell curve. If you observe multiple peaks for a variable, that variable did not converge and it makes the model degenerate: take it out and rerun your model.
#pdf(file = 'mcmc_diag.pdf')
mcmc.diagnostics(fit17)
## Warning in formals(fun): argument is not a function
#dev.off()
rm(list = ls())
if(!require(statnet)) install.packages("statnet",repos = "http://cran.us.r-project.org")
library("statnet")
if(!require(mice)) install.packages("mice",repos = "http://cran.us.r-project.org")
library("mice")
if(!require(texreg)) install.packages("texreg",repos = "http://cran.us.r-project.org")
library("texreg")
if(!require(xergm)) install.packages("xergm",repos = "http://cran.us.r-project.org")
library("xergm")
if(!require(ggplot2)) install.packages("ggplot2",repos = "http://cran.us.r-project.org")
library("ggplot2")
if(!require(kableExtra)) install.packages("kableExtra",repos = "http://cran.us.r-project.org")
library("kableExtra")
if(!require(GGally)) install.packages("GGally",repos = "http://cran.us.r-project.org")
library("GGally")
if(!require(scales)) install.packages("scales",repos = "http://cran.us.r-project.org")
library("scales")
## read data
el <- read.table('Edgelist_3Classes_Fall.csv', sep = ";",
header = TRUE)
dt <- read.table('Students_attributes_short.csv', sep = ";",
header = TRUE)
## create adjacency matrix
students <- unique(c(el$studentID, el$friend.ID.code))
mat <- matrix(0, nrow =length(students) , ncol = length(students))
colnames(mat) <- as.character(students)
rownames(mat) <- as.character(students)
for (i in 1:nrow(el)) {
row.index <- which(students == el[i, 1])
col.index <- which(students == el[i, 2])
mat[row.index, col.index] <- 1
}
## create network object
nw <- network(mat, directed = TRUE)
## add some attributes
identical(rownames(mat), dt$studentID)
att <- data.frame('studentID' = rownames(mat))
#impute some missing values for numeric variables
imp <- cbind(dt$studentID,
dt$marks.overall.num,
dt$drink.week.total,
dt$risk.beh,
dt$relig.part.num)
imp <- mice(imp)
imp <- complete(imp)
summary(dt$marks.overall.num)
summary(imp$V2)
# 0. class
att$class <- dt$class[match(att$studentID, dt$studentID)]
set.vertex.attribute(nw, 'class', as.character(att$class))
# 1. Gender
att <- data.frame('studentID' = rownames(mat))
att$sex <- dt$sex[match(att$studentID, dt$studentID)]
set.vertex.attribute(nw, 'sex', as.character(att$sex))
# 2. marks (= school grades, the Swiss way)
att$grade.overall <- imp$V2[match(att$studentID, imp$V1)]
set.vertex.attribute(nw, 'grade', att$grade.overall)
# 3. stress.school.num (higher = more stress)
att$stress.school <- dt$stress.school.num[match(att$studentID, dt$studentID)]
set.vertex.attribute(nw, 'stress', att$stress.school)
# 4. life.satis.num (higher = happier)
att$life.satis.num <- dt$life.satis.num[match(att$studentID, dt$studentID)]
set.vertex.attribute(nw, 'life.sat', att$life.satis.num)
# 5. healthy.body.image (1 = healthy body image, 0 = not happy with body/weight/bmi, think they need to change even though their weight is normal)
att$healthy.body.image <- dt$healthy.body.image[match(att$studentID, dt$studentID)]
set.vertex.attribute(nw, 'body.image', as.character(att$healthy.body.image))
# 6. drink.week.total (nr of glasses)
att$drink.week.total <- imp$V3[match(att$studentID, imp$V1)]
set.vertex.attribute(nw, 'drink', att$drink.week.total)
# 7. sport.hours (nr of sport hours)
att$sport.hours <- dt$sport.hours[match(att$studentID, dt$studentID)]
set.vertex.attribute(nw, 'sport', att$sport.hours)
# 8. risk behavior (higher = more inclined to take risks (drive drunk, no seatbelt, no helmet))
att$risk.beh <- imp$V4[match(att$studentID, imp$V1)]
set.vertex.attribute(nw, 'risk', att$risk.beh)
# 9. college bound (1 = heading for university/college)
att$college.bound <- dt$college.bound[match(att$studentID, dt$studentID)]
set.vertex.attribute(nw, 'college.bound', as.character(att$college.bound))
# 10. relig.part.num (higher = more often attending religious services)
att$relig.part.num <- imp$V5[match(att$studentID, imp$V1)]
set.vertex.attribute(nw, 'relig', att$relig.part.num)
# 11. being a gamer or not (1 = gaming more than 2hr a week)
att$gaming <- dt$games[match(att$studentID, dt$studentID)]
set.vertex.attribute(nw, 'gaming', att$gaming)
# 12. watching TV (hr watching tv)
att$hr.tv <- dt$hr.tv[match(att$studentID, dt$studentID)]
set.vertex.attribute(nw, 'tvhours', att$hr.tv)
# 13. liking school
att$liking.school <- dt$liking.school[match(att$studentID, dt$studentID)]
set.vertex.attribute(nw, 'likingschool', as.character(att$liking.school))
nw %v% 'likingschool' = as.character(att$liking.school)
fit0 <- ergm(nw ~ edges)
summary(fit0)
coef0 <- as.numeric(coef(fit0)['edges'])
# The probability
exp(coef0)/(1 + exp(coef0))
# Number of edges using the 'summary function':
e <- as.numeric(summary(nw ~ edges))
# Number of nodes
n <- length(att$studentID)
# Edge probability:
e/(n*(n-1))
# We can use the same function from above
summary(nw ~ mutual)
m1 <- as.formula( nw ~ # The network
mutual + # How many ties are reciprocated?
edges) # Edges term, which should always be included, so we leave it at the end
fit1 <- ergm(m1)
summary(fit1)
m1b <- as.formula( nw ~ # The network
mutual('sex', diff = TRUE) + # Mutual ties within same-sex dyads, counted separately for each sex
edges) # Edges term, which should always be included, so we leave it at the end
fit1b <- ergm(m1b)
summary(fit1b)
summary(nw ~ idegree(0:10))
summary(nw ~ odegree(0:10))
m2a <- as.formula( nw ~ # The network
mutual + # How many ties are reciprocated?
idegree(1:7) + # in-degree terms for degrees 1-7
edges) # Edges term, which should always be included, so we leave it at the end
fit2a <- ergm(m2a)
summary(fit2a)
m2b <- as.formula( nw ~ # The network
mutual + # How many ties are reciprocated?
odegree(1:7) + # out-degree terms for degrees 1-7
edges) # Edges term, which should always be included, so we leave it at the end
fit2b <- ergm(m2b)
summary(fit2b)
m3a <- as.formula( nw ~ # The network
gwodegree(0.5, fixed = TRUE) + # Geometrically weighted OUTdegree with a decay of 0.5
gwidegree(0.5, fixed = TRUE) + # Geometrically weighted INdegree with a decay of 0.5
edges) # Edges term, which should always be included, so we leave it at the end
m3b <- as.formula( nw ~ # The network
gwodegree(1, fixed = TRUE) + # Geometrically weighted OUTdegree with a decay of 1
gwidegree(1, fixed = TRUE) + # Geometrically weighted INdegree with a decay of 1
edges) # Edges term, which should always be included, so we leave it at the end
m3c <- as.formula( nw ~ # The network
gwodegree(2, fixed = TRUE) + # Geometrically weighted OUTdegree with a decay of 2
gwidegree(2, fixed = TRUE) + # Geometrically weighted INdegree with a decay of 2
edges) # Edges term, which should always be included, so we leave it at the end
fit3a <- ergm(m3a)
fit3b <- ergm(m3b)
fit3c <- ergm(m3c)
screenreg(list(fit3a, fit3b, fit3c))
summary(nw ~ triangle + ttriple + ctriple)
m4 <- as.formula(nw ~ # The network
triangle + # The term to include triangles
edges) # Edges term, which should always be included, so we leave it at the end
fit4 <- ergm(m4)
summary(nw ~ esp(0:10))
m7 <- as.formula(nw ~
nodefactor('sex') + # Difference in connections conditional on gender
mutual +
edges) # edges term
fit7 <- ergm(m7)
summary(fit7)
m7b <- as.formula(nw ~
nodeofactor('sex') + # Difference in outgoing connections conditional on gender
nodeifactor('sex') + # Difference in incoming connections conditional on gender
mutual +
edges)
fit7b <- ergm(m7b)
summary(fit7b)
m8 <- as.formula(nw ~
nodefactor('sex') + # Incoming AND outgoing gender
nodeofactor('college.bound') + # Outgoing bound to college?
nodeifactor('college.bound') + # Incoming bound to college?
nodeofactor('likingschool') + # Outgoing likes school
nodeifactor('likingschool') + # Incoming likes school
nodeofactor('gaming') + # Outgoing considered a gamer?
nodeifactor('gaming') + # Incoming considered a gamer?
mutual +
edges)
fit8 <- ergm(m8)
summary(fit8)
m8b <- as.formula(nw ~
nodefactor('sex') + # Incoming AND outgoing gender
nodeofactor('college.bound') + # Outgoing bound to college?
nodeifactor('college.bound') + # Incoming bound to college?
nodeofactor('likingschool', base = -3) + # Outgoing likes school
nodeifactor('likingschool', base = 3) + # Incoming likes school
nodeofactor('gaming') + # Outgoing considered a gamer?
nodeifactor('gaming') + # Incoming considered a gamer?
mutual +
edges)
fit8b <- ergm(m8b)
summary(fit8b)
m9 <- as.formula(nw ~
nodecov('grade') +
mutual +
edges)
fit9 <- ergm(m9)
summary(fit9)
m9b <- as.formula(nw ~
nodeicov('grade') +
nodeocov('grade') +
mutual +
edges)
fit9b <- ergm(m9b)
summary(fit9b)
m10 <- as.formula(nw ~
nodeifactor('college.bound') + # Incoming bound to college?
nodeofactor('college.bound') + # Outgoing bound to college?
nodeifactor('likingschool') + # Incoming: likes school
nodeofactor('likingschool') + # Outgoing: likes school
nodeicov('grade') + # Incoming Grades
nodeocov('grade') + # Outgoing Grades
nodeocov('stress') + # Outgoing Stress level
nodeicov('stress') + # Incoming Stress level
nodeocov('risk') + # Outgoing Risky behaviour
nodeicov('risk') + # Incoming Risky behaviour
nodeocov('tvhours') + # Outgoing hours watching tv
nodeicov('tvhours') + # Incoming hours watching tv
mutual +
edges)
fit10 <- ergm(m10)
summary(fit10)
m11 <- as.formula(nw ~
nodematch('class') + # Match between students of the same class
nodefactor('class') + # But also, does class have an effect in the nomination?
mutual +
edges)
fit11 <- ergm(m11)
summary(fit11)
m12 <- as.formula(nw ~
nodematch('class', diff = TRUE) + # Making the difference between classes explicit
nodefactor('class') + # But also, does class have an effect in the nomination?
mutual +
edges)
fit12 <- ergm(m12)
summary(fit12)
m12b <- as.formula(nw ~
nodematch('class', diff = TRUE, keep = 1) + # Keeping the first class in the estimation
nodefactor('class') + # But controlling for the class distribution
mutual +
edges)
fit12b <- ergm(m12b)
summary(fit12b)
# Notice how we keep the 'match' and 'class' terms in the same row to keep track of them
m13 <- as.formula(nw ~
nodematch('class') + nodefactor("class") +
nodematch('sex') + nodefactor('sex') +
nodematch('college.bound') + nodeifactor('college.bound') +
nodematch('body.image') + nodeifactor('body.image') + nodeofactor('body.image') +
nodematch('gaming') + nodeifactor('gaming') + nodeofactor('gaming') +
nodematch('likingschool', diff = TRUE) + nodeofactor('likingschool') +
mutual +
edges)
fit13 <- ergm(m13, control = control.ergm(seed = 12345))
summary(fit13)