Install sand and some additional packages to process and model the network data.
# ,echo=FALSE, include=FALSE
# install.packages("statnet")
if (!require("pacman")) install.packages("pacman")
pacman::p_load("ape",
"broom",
"d3r",
"ergm",
"ggplot2",
"jsonlite",
"tidyverse",
"igraph",
"lubridate",
"purrr",
"RColorBrewer",
"sand",
"sqldf",
"wrapr")
Load sand and some additional packages to process and model the network data.
# ,echo=FALSE, include=FALSE
library("ape")
library("broom")
library("d3r")
library("ergm")
library("ggplot2")
library("jsonlite")
library("dplyr")
library("igraph")
library("lubridate")
library("purrr")
library("RColorBrewer")
library("readr")
library("sand")
library("sqldf")
library("wrapr")
Lord of the Rings
Read CSV data from the morethanbooks Lord of the Rings Networks repository into a tibble.
ontology.csv contains the basic metadata about each entity (i.e. proper names used to reference characters, places, or groups) together with its identifier (e.g. the identifier for Aragorn is “arag”).
ontology = tibble(read.csv(url("https://raw.githubusercontent.com/morethanbooks/projects/master/LotR/ontologies/ontology.csv"), sep = "\t"))
names(ontology) <- c("id", "type", "label", "freqsum", "subtype", "gender")
head(ontology)
## # A tibble: 6 × 6
## id type label freqsum subtype gender
## <chr> <chr> <chr> <int> <chr> <chr>
## 1 andu pla Anduin 109 pla ""
## 2 arag per Aragorn 1069 men "male"
## 3 arat per Arathorn 36 men "male"
## 4 arwe per Arwen 51 elves "female"
## 5 bage pla Bag End 77 pla ""
## 6 bali per Balin 30 dwarf "male"
Read CSV data from the morethanbooks Lord of the Rings Networks repository into a tibble.
networks-id-3books.csv contains an edges table with the number of times two entities are mentioned in the same paragraph across all three books of the series.
In this project, the nodes represent entities (i.e. proper names used to reference characters, places, or groups), and two of them are connected by an edge if in any paragraph there are references to these two entities.
Across the three books, Frodo and Sam are referenced in the same paragraph most frequently (533 paragraphs), and Frodo and Gandalf are referenced in the second most number of paragraphs(181 paragraphs).
books123 = tibble(read.csv(url("https://raw.githubusercontent.com/morethanbooks/projects/master/LotR/tables/networks-id-3books.csv"), sep = ","))
books123 <- books123 %>%
dplyr::select("IdSource", "IdTarget", "Weight", "Type") %>%
dplyr::mutate("Type" = "Books 123",
"Weight" = as.double(Weight))
names(books123) <- c("source", "target", "weight", "volume")
head(books123)
## # A tibble: 6 × 4
## source target weight volume
## <chr> <chr> <dbl> <chr>
## 1 frod sams 533 Books 123
## 2 frod ganda 181 Books 123
## 3 merr pipp 162 Books 123
## 4 arag frod 146 Books 123
## 5 frod goll 127 Books 123
## 6 bilb frod 126 Books 123
books123 edgelist for an undirected graphWe can use sqldf to create a R data frame that combines the edges data from books123 and the metadata about the entities from ontology. The result is a data frame with all of the information we have about the paragraph references to pairs of entities across all three books.
network_df <- sqldf::sqldf("
SELECT
sour.id AS source_id, sour.label as source_name, sour.type AS source_type, sour.subtype AS source_subtype, sour.gender AS source_gender,
dest.id AS target_id, dest.label AS target_name, dest.type AS target_type, dest.subtype AS target_subtype, dest.gender AS target_gender,
conn.weight, conn.volume
FROM
books123 conn
JOIN ontology sour
ON
conn.source = sour.id
JOIN ontology dest
ON
conn.target = dest.id
UNION
SELECT
dest.id AS source_id, dest.label as source_name, dest.type AS source_type, dest.subtype AS source_subtype, dest.gender AS source_gender,
sour.id AS target_id, sour.label AS target_name, sour.type AS target_type, sour.subtype AS target_subtype, sour.gender AS target_gender,
conn.weight, conn.volume
FROM
books123 conn
JOIN ontology sour
ON
conn.source = sour.id
JOIN ontology dest
ON
conn.target = dest.id"
)
network_df %>%
dplyr::filter(source_name == "Frodo" | target_name == "Frodo", source_type == "per", target_type == "per") %>%
dplyr::arrange(source_id, desc(weight)) %>%
head(10)
## source_id source_name source_type source_subtype source_gender target_id
## 1 arag Aragorn per men male frod
## 2 arat Arathorn per men male frod
## 3 arwe Arwen per elves female frod
## 4 bali Balin per dwarf male frod
## 5 bere Beregond per men male frod
## 6 bilb Bilbo per hobbit male frod
## 7 bill Bill per animal male frod
## 8 boro Boromir per men male frod
## 9 cele Celeborn per elves male frod
## 10 dene Denethor per men male frod
## target_name target_type target_subtype target_gender weight volume
## 1 Frodo per hobbit male 146 Books 123
## 2 Frodo per hobbit male 2 Books 123
## 3 Frodo per hobbit male 6 Books 123
## 4 Frodo per hobbit male 4 Books 123
## 5 Frodo per hobbit male 1 Books 123
## 6 Frodo per hobbit male 126 Books 123
## 7 Frodo per hobbit male 9 Books 123
## 8 Frodo per hobbit male 68 Books 123
## 9 Frodo per hobbit male 1 Books 123
## 10 Frodo per hobbit male 7 Books 123
igraph has many functions for reading and writing graphs and converting to and from other data formats. We can create a network graph G from a R data frame using igraph’s graph_from_data_frame function.
my_edges <- sqldf::sqldf("
SELECT
sour.label as source_name, sour.type as source_type,
dest.label AS target_name, dest.type AS target_type,
conn.weight
FROM
books123 conn
JOIN ontology sour
ON
conn.source = sour.id
JOIN ontology dest
ON
conn.target = dest.id"
) %>%
dplyr::filter(source_type == "per", target_type == "per", weight > 20) %>%
dplyr::select(source_name, target_name, weight) %>%
dplyr::rename(from = source_name, to = target_name)
my_nodes <- network_df %>%
dplyr::filter(source_type == "per", target_type == "per", weight > 20) %>%
dplyr::select(source_name, source_type, source_subtype, source_gender, volume) %>%
dplyr::rename(name = source_name, type = source_type, subtype = source_subtype, gender = source_gender) %>%
dplyr::distinct()
G <- graph_from_data_frame(my_edges, directed = FALSE, vertices = my_nodes)
#print_all(G)
G
## IGRAPH 962e5b6 UNWB 31 79 --
## + attr: name (v/c), type (v/c), subtype (v/c), gender (v/c), volume
## | (v/c), weight (e/n)
## + edges from 962e5b6 (vertex names):
## [1] Frodo --Sam Frodo --Gandalf Merry --Pippin Aragorn--Frodo
## [5] Frodo --Gollum Bilbo --Frodo Gandalf--Pippin Aragorn--Gandalf
## [9] Gollum --Sam Frodo --Pippin Gimli --Legolas Pippin --Sam
## [13] Frodo --Merry Aragorn--Legolas Aragorn--Sam Aragorn--Gimli
## [17] Gandalf--Saruman Boromir--Frodo Aragorn--Merry Aragorn--Pippin
## [21] Aragorn--Boromir Merry --Sam Aragorn--Elrond Gandalf--Théoden
## [25] Faramir--Frodo Bilbo --Gandalf Gandalf--Gimli Gandalf--Sam
## + ... omitted several edges
vertex.attributes(G)
## $name
## [1] "Aragorn" "Arathorn" "Arwen" "Beregond" "Bilbo" "Bill"
## [7] "Boromir" "Celeborn" "Denethor" "Elendil" "Elrond" "Éomer"
## [13] "Éowyn" "Faramir" "Frodo" "Galadriel" "Gandalf" "Gimli"
## [19] "Glóin" "Gollum" "Isildur" "Legolas" "Merry" "Pippin"
## [25] "Sam" "Saruman" "Sauron" "Shadowfax" "Théoden" "Bombadil"
## [31] "Treebeard"
##
## $type
## [1] "per" "per" "per" "per" "per" "per" "per" "per" "per" "per" "per" "per"
## [13] "per" "per" "per" "per" "per" "per" "per" "per" "per" "per" "per" "per"
## [25] "per" "per" "per" "per" "per" "per" "per"
##
## $subtype
## [1] "men" "men" "elves" "men" "hobbit" "animal" "men" "elves"
## [9] "men" "men" "elves" "men" "men" "men" "hobbit" "elves"
## [17] "ainur" "dwarf" "dwarf" "hobbit" "men" "elves" "hobbit" "hobbit"
## [25] "hobbit" "ainur" "ainur" "animal" "men" "ainur" "ents"
##
## $gender
## [1] "male" "male" "female" "male" "male" "male" "male" "male"
## [9] "male" "male" "male" "male" "female" "male" "male" "female"
## [17] "male" "male" "male" "male" "male" "male" "male" "male"
## [25] "male" "male" "male" "male" "male" "male" "male"
##
## $volume
## [1] "Books 123" "Books 123" "Books 123" "Books 123" "Books 123" "Books 123"
## [7] "Books 123" "Books 123" "Books 123" "Books 123" "Books 123" "Books 123"
## [13] "Books 123" "Books 123" "Books 123" "Books 123" "Books 123" "Books 123"
## [19] "Books 123" "Books 123" "Books 123" "Books 123" "Books 123" "Books 123"
## [25] "Books 123" "Books 123" "Books 123" "Books 123" "Books 123" "Books 123"
## [31] "Books 123"
The following analyses are adapted from the book, Statistical Analysis of Network Data with R, 2nd Edition, by Eric Kolaczyk and Gabor Csardi, to analyze the LotR network.
Statistical Analysis of Network Data with R, 2nd Edition
LotR Network into CommunitiesUsing an agglomerative hierarchical clustering algorithm, implemented in igraph as cluster_fast_greedy, to determine the number of communities in the network, we identify 6 communities in the LotR network.
# Identify communities in the network using an agglomerative hierarchical clustering algorithm
lotr_c <- cluster_fast_greedy(G, weights = NULL)
str(lotr_c)
## List of 6
## $ merges : chr [1:6] "Aragorn" "Arathorn" "Arwen" "Elendil" ...
## $ modularity: chr [1:8] "Éomer" "Éowyn" "Gandalf" "Saruman" ...
## $ membership: chr [1:4] "Gimli" "Glóin" "Legolas" "Merry"
## $ names : chr [1:6] "Bilbo" "Bill" "Frodo" "Gollum" ...
## $ algorithm : chr [1:5] "Beregond" "Boromir" "Denethor" "Faramir" ...
## $ vcount : chr [1:2] "Celeborn" "Galadriel"
## - attr(*, "class")= chr "communities"
lotr_c[ 1:length(lotr_c) ]
## $`1`
## [1] "Aragorn" "Arathorn" "Arwen" "Elendil" "Elrond" "Isildur"
##
## $`2`
## [1] "Éomer" "Éowyn" "Gandalf" "Saruman" "Sauron" "Shadowfax"
## [7] "Théoden" "Treebeard"
##
## $`3`
## [1] "Gimli" "Glóin" "Legolas" "Merry"
##
## $`4`
## [1] "Bilbo" "Bill" "Frodo" "Gollum" "Sam" "Bombadil"
##
## $`5`
## [1] "Beregond" "Boromir" "Denethor" "Faramir" "Pippin"
##
## $`6`
## [1] "Celeborn" "Galadriel"
lotr_c
## IGRAPH clustering fast greedy, groups: 6, mod: 0.26
## + groups:
## $`1`
## [1] "Aragorn" "Arathorn" "Arwen" "Elendil" "Elrond" "Isildur"
##
## $`2`
## [1] "Éomer" "Éowyn" "Gandalf" "Saruman" "Sauron" "Shadowfax"
## [7] "Théoden" "Treebeard"
##
## $`3`
## [1] "Gimli" "Glóin" "Legolas" "Merry"
##
## + ... omitted several groups/vertices
# Number of communities
length(lotr_c)
## [1] 6
# Number of nodes (people) in each of the communities
sizes(lotr_c)
## Community sizes
## 1 2 3 4 5 6
## 6 8 4 6 5 2
# Display the community numbers and the nodes (people) in each of the communities
membership(lotr_c)
## Aragorn Arathorn Arwen Beregond Bilbo Bill Boromir Celeborn
## 1 1 1 5 4 4 5 6
## Denethor Elendil Elrond Éomer Éowyn Faramir Frodo Galadriel
## 5 1 1 2 2 5 4 6
## Gandalf Gimli Glóin Gollum Isildur Legolas Merry Pippin
## 2 3 3 4 1 3 3 5
## Sam Saruman Sauron Shadowfax Théoden Bombadil Treebeard
## 4 2 2 2 2 4 2
# Display the community designations
# shown in membership(lotr_c)
par(mfrow=c(1,1), mar=c(0,0,1,0))
plot(lotr_c, G)
title("Partitioning of the LotR network into communities")
par(mar=c(0,0,1,0))
dendPlot(lotr_c, mode = "phylo")
title("Dendrogram of communities in the LotR network")
We identified 6 communities in the LotR network using an agglomerative hierarchical clustering algorithm. Is this number of communities unexpected or unusual? To assess whether this number of communities is unusual, let’s compare this empirical outcome to the number of communities we find in random graphs that have similar properties to the LotR network:
Graphs that have the same number of nodes (31) and edges (79) as the LotR network
Graphs that have the further restriction that they have the same degree distribution as the LotR network
Using Monte Carlo methods, we can assess whether the number of communities we identified in the LotR network is unusual or to be expected by comparing it to the number of communities we identify in random graphs with similar properties.
# Number of nodes
nv <- vcount(G)
# Number of edges
ne <- ecount(G)
# Calculate the degree of each node
degs <- degree(G)
# Number of MC trials
ntrials <- 1000
# Calculate the number of communities across 1000 "G(n,m)" random graphs
num.comm.rg <- numeric(ntrials)
for(i in (1:ntrials)){
g.rg <- sample_gnm(nv, ne) # Returns a G(n,m) random graph
c.rg <- cluster_fast_greedy(g.rg, weights = NULL)
num.comm.rg[i] <- length(c.rg)
}
# Calculate the number of communities across 1000
# "G(n,m) + same degree distribution" random graphs
num.comm.grg <- numeric(ntrials)
for(i in (1:ntrials)){
g.grg <- sample_degseq(degs, method="vl") # Returns G(n,m) + same degree dist
c.grg <- cluster_fast_greedy(g.grg, weights = NULL)
num.comm.grg[i] <- length(c.grg)
}
# Calculate the proportion of trials with each Number of Communities
# both for the Fixed Size graphs and the Fixed Degree Sequence graphs
rslts <- c(num.comm.rg,num.comm.grg)
indx <- c(rep(0, ntrials), rep(1, ntrials))
freqs <- table(indx, rslts)/ntrials
# Plot the proportion of trials with each Number of Communities
# both for the Fixed Size graphs and the Fixed Degree Sequence graphs
barplot(freqs, beside=TRUE, col=c("steelblue", "darkorange"),
xlab="Number of Communities",
ylab="Relative Frequency",
legend=c("Fixed Size", "Fixed Degree Sequence"))
Based on this analysis, the number of communities we identified in the LotR network (6) is slightly unusual compared to the number we would expect to find based on random graphs with similar properties. The results suggest there are likely additional processes at work in the LotR network that go beyond simply the density and distribution of social interactions in the network.
A typical approach to assessing small-world properties is to compare the observed clustering coefficient and average (shortest) path length in an observed network to what might be observed in an appropriately calibrated random graph. Under such a comparison, if the observed network exhibits small-world properties, we should expect to see that the observed clustering coefficient exceeds that of a random graph, while the average path length remains roughly the same.
# Number of nodes
nv <- vcount(G)
# Number of edges
ne <- ecount(G)
# Number of MC trials
ntrials <- 1000
# Calculate global transitivity and average path length
# for connected "G(n,m)" random graphs
cl.rg <- numeric(ntrials)
apl.rg <- numeric(ntrials)
for(i in (1:ntrials)){
g.rg <- sample_gnm(nv, ne)
cl.rg[i] <- transitivity(g.rg, type = "globalundirected", weights = NULL)
apl.rg[i] <- mean_distance(g.rg, directed = FALSE, unconnected = FALSE)
}
# Global transitivity: random graph
round(summary(cl.rg), 3)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.079 0.147 0.167 0.167 0.188 0.254
# Global transitivity: LotR network
round(transitivity(G), 3)
## [1] 0.453
# Average path length: random graph
round(summary(apl.rg), 3)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.131 2.194 2.219 2.420 2.245 5.738
# Average path length: LotR network
round(mean_distance(G), 3)
## [1] 2.128
Here we find that the observed LotR network exhibits small-world properties, namely, the observed clustering coefficient (0.45) exceeds that of a random graph, while the average path length (2.13) remains roughly the same.
Exponential Random Graph Models (ERGMs) are a general class of models based in exponential-family theory, analogous to classical generalized linear models (GLMs), for specifying the probability distribution for a set of random graphs or networks. Like GLMs, ERGMs are flexible – for instance, it’s possible to include variables representing features like homophily, triad effects, and a range of other features of interest, such as the attributes of people in a social network. However, some of the theoretical frameworks underlying GLMs hasn’t been formally justified for ERGMs, so they should be used and interpreted carefully.
The general form of the model specifies the probability of the entire network, as a function of terms that represent network features we hypothesize may occur more or less likely than expected by chance. The general form of the model can be written as:
\[\begin{align} P(Y = y) = \frac{\exp(\theta g(y))}{k(\theta)} \end{align}\]
where - Y is the random variable for the state of the network - g(y) is a vector of model statistics (network “covariates”) for network y - \(\theta\) is a vector of coefficients for the statistics - \(k(\theta)\) is a normalization constant
The ERGM expression for the probability of the entire graph, shown above, can be re-expressed in terms of the conditional log-odds of a single tie between two actors. \(\theta\) can be interpreted as that term’s contribution to the log-odds of an individual tie, conditional on all other dyads remaining the same. The coefficient for each term in the model is multiplied by the number of configurations that tie will create (or remove) for that specific term.
The ergm package, part of the statnet suite of packages for network analysis, provides an integrated set of tools to analyze and simulate networks based on ERGMs. The ergm package uses the network package to store network data as network objects, so we need to convert our igraph object into a network object. The first step is to separate our graph into an adjacency matrix and a tibble of node attributes.
# Identify the largest connected component of the LotR network
largest_cc <- (clusters(G)$membership == 1)
conn_comp <- induced_subgraph(G, largest_cc)
# Remove animal and ents characters from the connected component
conn_comp <- delete.vertices(conn_comp,
V(conn_comp)[!subtype %in% c("men", "hobbit", "elves", "dwarf", "ainur")])
# Convert igraph object into an adjacency matrix
A <- as_adjacency_matrix(conn_comp)
# Specify levels for the Gender and Subtype attributes
v.attrs <- as_tibble(igraph::get.data.frame(conn_comp, what = "vertices")) %>%
dplyr::filter(subtype %in% c("men", "hobbit", "elves", "dwarf", "ainur")) %>%
mutate(gender = as.numeric(factor(gender,
levels = c("male", "female"))),
subtype = as.numeric(factor(subtype,
levels = c("men", "hobbit", "elves", "dwarf", "ainur")))
)
# Display the node attributes of the connected component
vertex.attributes(conn_comp)
## $name
## [1] "Aragorn" "Arathorn" "Arwen" "Beregond" "Bilbo" "Boromir"
## [7] "Denethor" "Elendil" "Elrond" "Éomer" "Éowyn" "Faramir"
## [13] "Frodo" "Gandalf" "Gimli" "Glóin" "Gollum" "Isildur"
## [19] "Legolas" "Merry" "Pippin" "Sam" "Saruman" "Sauron"
## [25] "Théoden" "Bombadil"
##
## $type
## [1] "per" "per" "per" "per" "per" "per" "per" "per" "per" "per" "per" "per"
## [13] "per" "per" "per" "per" "per" "per" "per" "per" "per" "per" "per" "per"
## [25] "per" "per"
##
## $subtype
## [1] "men" "men" "elves" "men" "hobbit" "men" "men" "men"
## [9] "elves" "men" "men" "men" "hobbit" "ainur" "dwarf" "dwarf"
## [17] "hobbit" "men" "elves" "hobbit" "hobbit" "hobbit" "ainur" "ainur"
## [25] "men" "ainur"
##
## $gender
## [1] "male" "male" "female" "male" "male" "male" "male" "male"
## [9] "male" "male" "female" "male" "male" "male" "male" "male"
## [17] "male" "male" "male" "male" "male" "male" "male" "male"
## [25] "male" "male"
##
## $volume
## [1] "Books 123" "Books 123" "Books 123" "Books 123" "Books 123" "Books 123"
## [7] "Books 123" "Books 123" "Books 123" "Books 123" "Books 123" "Books 123"
## [13] "Books 123" "Books 123" "Books 123" "Books 123" "Books 123" "Books 123"
## [19] "Books 123" "Books 123" "Books 123" "Books 123" "Books 123" "Books 123"
## [25] "Books 123" "Books 123"
# Levels: female male
factor(get.vertex.attribute(conn_comp, "gender"),
levels = c("male", "female"))
## [1] male male female male male male male male male male
## [11] female male male male male male male male male male
## [21] male male male male male male
## Levels: male female
as.numeric(factor(get.vertex.attribute(conn_comp, "gender")),
levels = c("male", "female"))
## [1] 2 2 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
# Levels: ainur animal dwarf elves ents hobbit men
factor(get.vertex.attribute(conn_comp, "subtype"),
levels = c("men", "hobbit", "elves", "dwarf", "ainur"))
## [1] men men elves men hobbit men men men elves men
## [11] men men hobbit ainur dwarf dwarf hobbit men elves hobbit
## [21] hobbit hobbit ainur ainur men ainur
## Levels: men hobbit elves dwarf ainur
as.numeric(factor(get.vertex.attribute(conn_comp, "subtype"),
levels = c("men", "hobbit", "elves", "dwarf", "ainur")))
## [1] 1 1 3 1 2 1 1 1 3 1 1 1 2 5 4 4 2 1 3 2 2 2 5 5 1 5
Create a network object for ergm. Add Gender and Subtype attributes to the nodes.
lotr.s <- network::as.network(as.matrix(A), directed=FALSE)
network::set.vertex.attribute(lotr.s, "Gender", v.attrs$gender)
network::set.vertex.attribute(lotr.s, "Subtype", v.attrs$subtype)
lotr.s
## Network attributes:
## vertices = 26
## directed = FALSE
## hyper = FALSE
## loops = FALSE
## multiple = FALSE
## bipartite = FALSE
## total edges= 71
## missing edges= 0
## non-missing edges= 71
##
## Vertex attribute names:
## Gender Subtype vertex.names
##
## No edge attributes
Plot the nodes of the LotR network (color nodes by Subtype).
set.seed(13)
par(mfrow=c(1,1), mar=c(0,0,1,0))
plot(lotr.s,
main="LotR Network",
cex.main=0.9,
label=network.vertex.names(lotr.s),
vertex.col='Subtype')
The syntax for specifying a model in the ergm package follows R’s formula convention:
my.network ∼ my.vector.of.model.terms
This syntax is used for both the summary and ergm functions. The summary function simply returns the numerical values of the network statistics in the model. The ergm function estimates the model with those statistics.
# View the g(y) statistic for this model
summary(lotr.s ~ edges)
## edges
## 71
# Fit the model
lotr.bern <- ergm(lotr.s ~ edges)
# View the fitted model object
summary(lotr.bern)
## Call:
## ergm(formula = lotr.s ~ edges)
##
## Maximum Likelihood Results:
##
## Estimate Std. Error MCMC % z value Pr(>|z|)
## edges -1.2747 0.1342 0 -9.495 <1e-04 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Null Deviance: 450.5 on 325 degrees of freedom
## Residual Deviance: 341.2 on 324 degrees of freedom
##
## AIC: 343.2 BIC: 347 (Smaller is better. MC Std. Err. = 0)
tidy(lotr.bern)
## # A tibble: 1 × 6
## term estimate std.error mcmc.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 edges -1.27 0.134 0 -9.50 2.20e-21
glance(lotr.bern)
## # A tibble: 1 × 5
## independence iterations logLik AIC BIC
## <lgl> <int> <dbl> <dbl> <dbl>
## 1 TRUE 5 -171. 343. 347.
This simple model specifies a single homogeneous probability for all ties, which is captured by the coefficient of the edges term. We can interpret this coefficient by returning to the logit form of the ERGM. The log-odds that a tie is present is
logit(p(y)) = \(\theta \times \delta(g(y))\)
= -1.28 \(\times\) change in the number of ties
= -1.28 \(\times\) 1
for every tie, since the addition of any tie to the network always increases the total number of ties by 1.
The corresponding probability is obtained by taking the expit, or inverse logit, of \(\theta\):
= exp(-1.28) / (1 + exp(-1.28)) = 0.218
plogis(coef(lotr.bern))
## edges
## 0.2184615
This probability corresponds to the density we observe in the LotR network: there are 71 ties and \(26 \choose 2\) = (\(26 \times 25\)) = 325 dyads, so the probability of a tie is 71 / 325 = 0.218.
Subtype may be associated with the connections in this network. We can use ergm to test this. Subtype is a discrete attribute, so we use the ergm-term nodematch to investigate homophily in connections by Subtype.
ordered_subtypes <- c("men", "hobbit", "elves", "dwarf", "ainur")
# Frequencies of Subtype
subtypes_tbl <- table(lotr.s %v% "Subtype")
names(subtypes_tbl) <- ordered_subtypes
subtypes_tbl
## men hobbit elves dwarf ainur
## 11 6 3 2 4
# View ties between Subtype categories
subtypes_mm <- mixingmatrix(lotr.s, "Subtype")
colnames(subtypes_mm) <- ordered_subtypes
rownames(subtypes_mm) <- ordered_subtypes
subtypes_mm
## men hobbit elves dwarf ainur
## men 12 13 3 1 9
## hobbit 13 11 4 2 7
## elves 3 4 1 1 2
## dwarf 1 2 1 1 1
## ainur 9 7 2 1 3
## Note: Marginal totals can be misleading for undirected mixing matrices.
# View the g(y) statistic for this model
# When diff=FALSE, this term adds one network statistic to the model,
# which counts the number of edges (i,j) for which attr(i)==attr(j).
# This is also called ”uniform homophily,” because each group is assumed
# to have the same propensity for within-group ties.
set.seed(619)
summary(lotr.s ~ edges +
nodefactor("Subtype") +
nodematch("Subtype", diff=FALSE))
## edges nodefactor.Subtype.2 nodefactor.Subtype.3
## 71 48 12
## nodefactor.Subtype.4 nodefactor.Subtype.5 nodematch.Subtype
## 7 25 28
Here we fit the model with the ergm function.
# Fit the model
# When diff=FALSE, this term adds one network statistic to the model,
# which counts the number of edges (i,j) for which attr(i)==attr(j).
# This is also called ”uniform homophily,” because each group is assumed
# to have the same propensity for within-group ties.
set.seed(619)
lotr.hom.formula <- formula(lotr.s ~ edges +
nodefactor("Subtype") +
nodematch("Subtype", diff=FALSE)) #levels=-c(2,5)
lotr.homophily <- ergm(lotr.hom.formula) #control=control.ergm(MCMLE.maxit = 30)
# View the model results
summary(lotr.homophily)
## Call:
## ergm(formula = lotr.hom.formula)
##
## Maximum Likelihood Results:
##
## Estimate Std. Error MCMC % z value Pr(>|z|)
## edges -2.4635 0.3430 0 -7.183 < 1e-04 ***
## nodefactor.Subtype.2 1.0137 0.2576 0 3.935 < 1e-04 ***
## nodefactor.Subtype.3 0.2207 0.3591 0 0.615 0.538834
## nodefactor.Subtype.4 0.1234 0.4521 0 0.273 0.784831
## nodefactor.Subtype.5 0.7415 0.2971 0 2.496 0.012563 *
## nodematch.Subtype 1.2749 0.3426 0 3.722 0.000198 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Null Deviance: 450.5 on 325 degrees of freedom
## Residual Deviance: 311.6 on 319 degrees of freedom
##
## AIC: 323.6 BIC: 346.3 (Smaller is better. MC Std. Err. = 0)
tidy(lotr.homophily)
## # A tibble: 6 × 6
## term estimate std.error mcmc.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 edges -2.46 0.343 0 -7.18 6.82e-13
## 2 nodefactor.Subtype.2 1.01 0.258 0 3.93 8.33e- 5
## 3 nodefactor.Subtype.3 0.221 0.359 0 0.615 5.39e- 1
## 4 nodefactor.Subtype.4 0.123 0.452 0 0.273 7.85e- 1
## 5 nodefactor.Subtype.5 0.742 0.297 0 2.50 1.26e- 2
## 6 nodematch.Subtype 1.27 0.343 0 3.72 1.98e- 4
glance(lotr.homophily)
## # A tibble: 1 × 5
## independence iterations logLik AIC BIC
## <lgl> <int> <dbl> <dbl> <dbl>
## 1 TRUE 5 -156. 324. 346.
By exponentiating the coefficients, they can be interpreted as conditional odds ratios for interaction (ties) between people in the LotR network. For example, being Hobbit rather than Men increases the odds of interaction by a factor of exp(1.0137) \(\approx\) 2.76, or over 175% (“all else being equal”). Converting this value into a probability, this corresponds to a difference in the probability of a tie of 11 percentage points, from 8% for Men to 19% for Hobbit. Similarly, being of the same Subtype increases the odds of interaction by a factor of exp(1.275) \(\approx\) 3.58, or over 250%.
# Subtype levels: men hobbit elves dwarf ainur
# Exponentiate the coefficients (conditional odds ratio)
exp(coef(lotr.homophily))
## edges nodefactor.Subtype.2 nodefactor.Subtype.3
## 0.08513709 2.75582219 1.24693831
## nodefactor.Subtype.4 nodefactor.Subtype.5 nodematch.Subtype
## 1.13136705 2.09910875 3.57840744
# Inverse-logit the coefficients (conditional probabilities)
# Probability of a tie if Men (not same Subtype)
round(plogis(coef(lotr.homophily)[1]), 3)
## edges
## 0.078
# Probability of a tie if Hobbit rather than Men (not same Subtype)
round(plogis(coef(lotr.homophily)[1] + coef(lotr.homophily)[2]), 3)
## edges
## 0.19
# Difference in probability of a tie if Hobbit rather than Men (not same Subtype)
round(plogis(coef(lotr.homophily)[1] + coef(lotr.homophily)[2]) - plogis(coef(lotr.homophily)[1]), 3)
## edges
## 0.112
The analysis of variance (ANOVA) table indicates that there is strong evidence that the variables used in the model explain the variation in network connectivity, with a decrease in residual deviance from 451 to 312 with only six variables.
anova(lotr.homophily)
## Analysis of Variance Table
##
## Model 1: lotr.s ~ edges + nodefactor("Subtype") + nodematch("Subtype",
## diff = FALSE)
## Df Deviance Resid. Df Resid. Dev Pr(>|Chisq|)
## NULL 325 450.55
## Model 1: 6 138.91 319 311.64 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Main effects and second-order (e.g. similarity or homophily effects) of node attributes can be incorporated into a ergm model with the terms nodemain and nodematch. For example, we can evaluate whether gender has a “main” effect and subtype has a “second-order” effect on the formation of collaborative ties among people in the LotR network, while accounting for the effects of transitivity, with the following model specification.
lotr.ergm <- formula(lotr.s ~ gwesp(1, fixed = TRUE)
+ nodemain("Gender")
+ nodematch("Subtype", diff=TRUE, levels=-c(4))
)
# View the g(y) statistic for this model
# Subtype levels: men hobbit elves dwarf ainur
summary(lotr.ergm)
## gwesp.fixed.1 nodecov.Gender nodematch.Subtype.1 nodematch.Subtype.2
## 137.546 146.000 12.000 11.000
## nodematch.Subtype.3 nodematch.Subtype.5
## 1.000 3.000
Here we fit the model with the ergm function.
# , message=FALSE
set.seed(619)
lotr.ergm.fit <- ergm(lotr.ergm)
lotr.ergm.fit
##
## Call:
## ergm(formula = lotr.ergm)
##
## Last MCMC sample of size 1854 based on:
## gwesp.fixed.1 nodecov.Gender nodematch.Subtype.1
## 0.7333 -1.6690 0.4291
## nodematch.Subtype.2 nodematch.Subtype.3 nodematch.Subtype.5
## 1.5615 2.7616 1.4497
##
## Monte Carlo Maximum Likelihood Coefficients:
## gwesp.fixed.1 nodecov.Gender nodematch.Subtype.1
## 0.7321 -1.6688 0.4247
## nodematch.Subtype.2 nodematch.Subtype.3 nodematch.Subtype.5
## 1.5926 2.7573 1.3530
When dyad dependent terms are in the model, the computational algorithms in ergm use MCMC (with a Metropolis-Hastings sampler) to estimate the parameters. For these models, it is important to assess model convergence before interpreting the model results – before evaluating statistical significance, interpreting coefficients, or assessing goodness of fit. To do this, we use the function mcmc.diagnostics.
# View MCMC diagnostics
mcmc.diagnostics(lotr.ergm.fit)
## Sample statistics summary:
##
## Iterations = 25088:973824
## Thinning interval = 512
## Number of chains = 1
## Sample size per chain = 1854
##
## 1. Empirical mean and standard deviation for each variable,
## plus standard error of the mean:
##
## Mean SD Naive SE Time-series SE
## gwesp.fixed.1 7.12295 56.8572 1.32048 4.06828
## nodecov.Gender 5.39213 42.4370 0.98558 3.11875
## nodematch.Subtype.1 0.62513 6.1429 0.14266 0.39485
## nodematch.Subtype.2 -0.12729 2.3969 0.05567 0.12232
## nodematch.Subtype.3 0.01348 0.7791 0.01809 0.02453
## nodematch.Subtype.5 0.32093 1.6931 0.03932 0.08128
##
## 2. Quantiles for each variable:
##
## 2.5% 25% 50% 75% 97.5%
## gwesp.fixed.1 -104.9 -32.93 10.42 47.93 108.8
## nodecov.Gender -83.0 -22.00 10.00 36.00 77.0
## nodematch.Subtype.1 -10.0 -4.00 0.00 5.00 12.0
## nodematch.Subtype.2 -6.0 -2.00 0.00 2.00 3.0
## nodematch.Subtype.3 -1.0 0.00 0.00 0.00 2.0
## nodematch.Subtype.5 -3.0 -1.00 0.00 2.00 3.0
##
##
## Are sample statistics significantly different from observed?
## gwesp.fixed.1 nodecov.Gender nodematch.Subtype.1 nodematch.Subtype.2
## diff. 7.12294512 5.39212513 0.6251348 -0.1272923
## test stat. 1.75084927 1.72893822 1.5832360 -1.0406283
## P-val. 0.07997188 0.08382015 0.1133677 0.2980481
## nodematch.Subtype.3 nodematch.Subtype.5 Overall (Chi^2)
## diff. 0.01348436 3.209277e-01 NA
## test stat. 0.54981079 3.948327e+00 17.188534871
## P-val. 0.58244916 7.869925e-05 0.009753957
##
## Sample statistics cross-correlations:
## gwesp.fixed.1 nodecov.Gender nodematch.Subtype.1
## gwesp.fixed.1 1.0000000 0.9880177 0.74302029
## nodecov.Gender 0.9880177 1.0000000 0.75875048
## nodematch.Subtype.1 0.7430203 0.7587505 1.00000000
## nodematch.Subtype.2 0.2573615 0.2401768 0.01178483
## nodematch.Subtype.3 0.1566461 0.1953270 0.13265541
## nodematch.Subtype.5 0.4031580 0.4082623 0.19821161
## nodematch.Subtype.2 nodematch.Subtype.3 nodematch.Subtype.5
## gwesp.fixed.1 0.257361541 0.156646091 0.403157994
## nodecov.Gender 0.240176797 0.195327012 0.408262338
## nodematch.Subtype.1 0.011784826 0.132655415 0.198211606
## nodematch.Subtype.2 1.000000000 -0.005149296 0.036666944
## nodematch.Subtype.3 -0.005149296 1.000000000 0.007763918
## nodematch.Subtype.5 0.036666944 0.007763918 1.000000000
##
## Sample statistics auto-correlation:
## Chain 1
## gwesp.fixed.1 nodecov.Gender nodematch.Subtype.1 nodematch.Subtype.2
## Lag 0 1.0000000 1.0000000 1.0000000 1.0000000
## Lag 512 0.7831386 0.8068116 0.7170667 0.5898898
## Lag 1024 0.6408121 0.6626669 0.5481991 0.3801373
## Lag 1536 0.5306541 0.5490714 0.4377498 0.2809911
## Lag 2048 0.4431243 0.4572219 0.3481825 0.2171275
## Lag 2560 0.3531568 0.3643441 0.2634831 0.1603669
## nodematch.Subtype.3 nodematch.Subtype.5
## Lag 0 1.00000000 1.0000000
## Lag 512 0.29487569 0.5915891
## Lag 1024 0.09302527 0.3795243
## Lag 1536 0.04412157 0.2415097
## Lag 2048 0.03611901 0.1627408
## Lag 2560 0.01211165 0.1192354
##
## Sample statistics burn-in diagnostic (Geweke):
## Chain 1
##
## Fraction in 1st window = 0.1
## Fraction in 2nd window = 0.5
##
## gwesp.fixed.1 nodecov.Gender nodematch.Subtype.1 nodematch.Subtype.2
## -0.7041 -0.7285 -1.2700 0.9668
## nodematch.Subtype.3 nodematch.Subtype.5
## -0.3637 -0.9260
##
## Individual P-values (lower = worse):
## gwesp.fixed.1 nodecov.Gender nodematch.Subtype.1 nodematch.Subtype.2
## 0.4813847 0.4663121 0.2040674 0.3336344
## nodematch.Subtype.3 nodematch.Subtype.5
## 0.7160994 0.3544633
## Joint P-value (lower = worse): 0.629981 .
##
## MCMC diagnostics shown here are from the last round of simulation, prior to computation of final parameter estimates. Because the final estimates are refinements of those used for this simulation run, these diagnostics may understate model performance. To directly assess the performance of the final model on in-model statistics, please use the GOF command: gof(ergmFitObject, GOF=~model).
Here we view the model results.
summary(lotr.ergm.fit)
## Call:
## ergm(formula = lotr.ergm)
##
## Monte Carlo Maximum Likelihood Results:
##
## Estimate Std. Error MCMC % z value Pr(>|z|)
## gwesp.fixed.1 0.7321 0.1185 0 6.177 < 1e-04 ***
## nodecov.Gender -1.6688 0.1648 0 -10.125 < 1e-04 ***
## nodematch.Subtype.1 0.4247 0.2668 0 1.592 0.111354
## nodematch.Subtype.2 1.5926 0.4543 0 3.506 0.000455 ***
## nodematch.Subtype.3 2.7573 1.3578 0 2.031 0.042282 *
## nodematch.Subtype.5 1.3530 0.6680 0 2.026 0.042814 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Null Deviance: 450.5 on 325 degrees of freedom
## Residual Deviance: 269.7 on 319 degrees of freedom
##
## AIC: 281.7 BIC: 304.4 (Smaller is better. MC Std. Err. = 0.3104)
tidy(lotr.ergm.fit)
## # A tibble: 6 × 6
## term estimate std.error mcmc.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 gwesp.fixed.1 0.732 0.119 0 6.18 6.52e-10
## 2 nodecov.Gender -1.67 0.165 0 -10.1 4.26e-24
## 3 nodematch.Subtype.1 0.425 0.267 0 1.59 1.11e- 1
## 4 nodematch.Subtype.2 1.59 0.454 0 3.51 4.55e- 4
## 5 nodematch.Subtype.3 2.76 1.36 0 2.03 4.23e- 2
## 6 nodematch.Subtype.5 1.35 0.668 0 2.03 4.28e- 2
glance(lotr.ergm.fit)
## # A tibble: 1 × 5
## independence iterations logLik AIC BIC
## <lgl> <int> <dbl> <dbl> <dbl>
## 1 FALSE 28 -135. 282. 304.
By exponentiating the coefficients, they can be interpreted as conditional odds ratios for interaction (ties) between people in the LotR network. For example, being Female rather than Male decreases the odds of interaction by a factor of exp(-1.67) \(\approx\) 0.19, or nearly 80% (“all else being equal”). Similarly, being of the same Hobbit Subtype increases the odds of interaction by a factor of exp(1.593) \(\approx\) 4.92, or nearly 400%. In addition, the coefficient and standard error for the alternating k-triangle statistic indicate there is evidence for a nontrivial transitivity effect.
# Subtype levels: men hobbit elves dwarf ainur
# Exponentiate the coefficients (conditional odds ratio)
exp(coef(lotr.ergm.fit))
## gwesp.fixed.1 nodecov.Gender nodematch.Subtype.1 nodematch.Subtype.2
## 2.0795037 0.1884673 1.5291392 4.9166118
## nodematch.Subtype.3 nodematch.Subtype.5
## 15.7577362 3.8690224
# Inverse-logit the coefficients (conditional probabilities)
# Probability of a tie if Men (not same Subtype)
round(plogis(coef(lotr.ergm.fit)[1]), 3)
## gwesp.fixed.1
## 0.675
# Probability of a tie if Female rather than Men (not same Subtype)
round(plogis(coef(lotr.ergm.fit)[1] + coef(lotr.ergm.fit)[2]), 3)
## gwesp.fixed.1
## 0.282
# Difference in probability of a tie if Female rather than Men (not same Subtype)
round(plogis(coef(lotr.ergm.fit)[1] + coef(lotr.ergm.fit)[2]) - plogis(coef(lotr.ergm.fit)[1]), 3)
## gwesp.fixed.1
## -0.394
The analysis of variance (ANOVA) table indicates that there is strong evidence that the variables used in the model explain the variation in network connectivity, with a decrease in residual deviance from 563 to 335 with only six variables.
anova(lotr.ergm.fit)
## Analysis of Variance Table
##
## Model 1: lotr.s ~ gwesp(1, fixed = TRUE) + nodemain("Gender") + nodematch("Subtype",
## diff = TRUE, levels = -c(4))
## Df Deviance Resid. Df Resid. Dev Pr(>|Chisq|)
## NULL 325 450.55
## Model 1: 6 180.9 319 269.65 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
To assess the goodness-of-fit of ERGMs, the current practice is to simulate many random graphs from the fitted model and then compare several summary statistics of these graphs to those of the original graph. If the summary statistics of the original graph don’t match the typical values of the fitted random graphs, then this suggests systematic differences between the model and the data and a lack of goodness-of-fit.
gof.lotr.ergm <- gof(lotr.ergm.fit)
gof.lotr.ergm
##
## Goodness-of-fit for degree
##
## obs min mean max MC p-value
## degree0 0 0 3.24 9 0.08
## degree1 5 0 2.09 8 0.20
## degree2 3 0 1.93 6 0.70
## degree3 1 0 2.24 6 0.74
## degree4 7 0 2.33 6 0.00
## degree5 0 0 2.15 7 0.22
## degree6 2 0 2.21 7 1.00
## degree7 2 0 2.40 6 1.00
## degree8 1 0 1.95 7 0.86
## degree9 1 0 1.77 7 1.00
## degree10 1 0 1.36 6 1.00
## degree11 0 0 0.89 4 1.00
## degree12 0 0 0.68 4 1.00
## degree13 0 0 0.36 2 1.00
## degree14 1 0 0.22 3 0.38
## degree15 0 0 0.12 2 1.00
## degree16 1 0 0.04 1 0.08
## degree17 1 0 0.02 1 0.04
##
## Goodness-of-fit for edgewise shared partner
##
## obs min mean max MC p-value
## esp0 8 0 4.29 13 0.26
## esp1 2 0 9.35 23 0.12
## esp2 12 0 13.70 25 0.86
## esp3 13 0 14.42 28 0.90
## esp4 7 0 11.71 32 0.56
## esp5 11 0 7.65 23 0.56
## esp6 10 0 4.12 20 0.28
## esp7 3 0 1.91 12 0.60
## esp8 2 0 0.64 8 0.30
## esp9 0 0 0.32 3 1.00
## esp10 1 0 0.08 2 0.14
## esp11 1 0 0.01 1 0.02
## esp12 0 0 0.01 1 1.00
## esp13 1 0 0.00 0 0.00
##
## Goodness-of-fit for minimum geodesic distance
##
## obs min mean max MC p-value
## 1 71 19 68.21 104 0.92
## 2 168 20 117.18 174 0.08
## 3 78 7 45.02 79 0.06
## 4 8 0 10.24 49 1.00
## 5 0 0 2.02 25 1.00
## 6 0 0 0.18 6 1.00
## 7 0 0 0.01 1 1.00
## Inf 0 0 82.14 254 0.08
##
## Goodness-of-fit for model statistics
##
## obs min mean max MC p-value
## gwesp.fixed.1 137.546 9 124.74 221.7522 0.76
## nodecov.Gender 146.000 38 137.22 210.0000 0.86
## nodematch.Subtype.1 12.000 1 11.22 26.0000 0.90
## nodematch.Subtype.2 11.000 3 10.84 15.0000 1.00
## nodematch.Subtype.3 1.000 0 0.92 3.0000 1.00
## nodematch.Subtype.5 3.000 0 2.96 6.0000 1.00
We can plot the results of the gof function. The results show that the model fit is moderate. The observed summary statistics are within the IQR of the simulated values in most cases (except for Subtype 3).
# fig.width = 6, fig.asp = 0.618
par(mfrow = c(2,2))
plot(gof.lotr.ergm)
Filippo Menczer, Santo Fortunato, and Clayton Davis. A First Course in Network Science. Cambridge University Press, 2020.
Eric Kolaczyk and Gabor Csardi. Statistical Analysis of Network Data with R, 2nd Edition. Springer, 2020.
Mark Newman. Networks, 2nd Edition. Oxford University Press, 2018.
Matthew Jackson. Social and Economic Networks. Princeton University Press, 2008.
Matthew Jackson. The Human Network: How Your Social Position Determines Your Power, Beliefs, and Behaviors. Vintage Books, 2020.
David Easley and Jon Kleinberg. Networks, Crowds, and Markets: Reasoning about a Highly Connected World. Cambridge University Press, 2010.