Rendering Social Networks with igraph
Visualizing graphs and analyzing networks using igraph.
#install.packages(c("igraph", "igraphdata"))
# load packages
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(igraphdata)
# Let's first create an undirected network consisting of three nodes and three edges using igraph
g1 <- make_graph(c(1,2, 2,3, 3,1), n = 3, directed = FALSE) # Note how the edge list is defined inside c()
class(g1)
## [1] "igraph"
plot(g1)

# n (the number of nodes) can be greater than the number of nodes that appear in the edge list; the extra nodes are simply left unconnected
g2 <- make_graph( edges=c(1,2, 2,3, 3,1), n = 10) # now with 10 vertices, and directed by default
plot(g2)

# Name the nodes
g3 <- make_graph(c("A","B", "B","C", "C","A")) # `n` is ignored when the edge list uses vertex names
# When the edge list has vertex names, the number of nodes is inferred from the names
plot(g3)

# Named nodes without edges --> 'isolates'
g4 <- make_graph( c("A", "B", "B", "C", "B", "C", "A", "A"), isolates=c("D", "E", "F", "G"))
set.seed(123)
plot(g4, edge.arrow.size = .5,
vertex.color = "yellow",
vertex.frame.color = "gray",
vertex.label.color = "black",
vertex.size = 8,
vertex.label.cex = 1.5,
vertex.label.dist = 2, # The distance of the label from the center of the node
edge.curved = 0.2)

# You can also use graph_from_literal() as a handy way to quickly create small graphs
# Undirected graph
plot(graph_from_literal(A-B, B-C)) # Two edges: A-B and B-C

# Directed graph
plot(graph_from_literal(A+-B, B-+C))

# Note: the number of dashes doesn't matter
# Use - for undirected edges and -+ or +- for directed edges, where + marks the arrowhead end; ++ creates a mutual (symmetric) tie; and : defines sets of nodes, so you can draw edges from one node (or set) to another set, as in the plot below and the small sketch that follows it
plot(graph_from_literal(A-+B:C:D))
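# An additional sketch of the set (:) and mutual-tie (++) syntax described above
plot(graph_from_literal(A:B ++ C:D)) # mutual (two-way) ties from each of A, B to each of C, D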

# Get edge, node and network attributes
# Plot a network
g5 <- make_graph( c("A", "B", "B", "C", "H", "C", "A", "E", "D", "C", "B", "E"),
isolates=c("F", "G"), directed = TRUE)
plot(g5)

# View the igraph object as an adjacency matrix
as_adjacency_matrix(g5)
## 8 x 8 sparse Matrix of class "dgCMatrix"
## A B C H E D F G
## A . 1 . . 1 . . .
## B . . 1 . 1 . . .
## C . . . . . . . .
## H . . 1 . . . . .
## E . . . . . . . .
## D . . 1 . . . . .
## F . . . . . . . .
## G . . . . . . . .
# View as a regular (dense) matrix
as_adjacency_matrix(g5, sparse = FALSE)
## A B C H E D F G
## A 0 1 0 0 1 0 0 0
## B 0 0 1 0 1 0 0 0
## C 0 0 0 0 0 0 0 0
## H 0 0 1 0 0 0 0 0
## E 0 0 0 0 0 0 0 0
## D 0 0 1 0 0 0 0 0
## F 0 0 0 0 0 0 0 0
## G 0 0 0 0 0 0 0 0
# g5[4, ] # Inspect node H's ties with the other nodes
# g5['A','H'] <- 1 # would add an A->H edge
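# You can also index the graph like an adjacency matrix (a quick sketch):
g5["A", "B"] # 1 if the A->B edge exists, 0 otherwise
g5["A", ] # A's ties to every other node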
# Get edge list
E(g5)
## + 6/6 edges from 4e6fa6a (vertex names):
## [1] A->B B->C H->C A->E D->C B->E
# Count the number of edges
ecount(g5)
## [1] 6
# Get node (vertex) list
V(g5)
## + 8/8 vertices, named, from 4e6fa6a:
## [1] A B C H E D F G
# Count the number of nodes (vertices)
vcount(g5)
## [1] 8
# Find the neighbors of A
V(g5)[.nei("A")]
## + 2/8 vertices, named, from 4e6fa6a:
## [1] B E
# Find the neighbors of A and B
g5[[c("A","B")]]
## $A
## + 2/8 vertices, named, from 4e6fa6a:
## [1] B E
##
## $B
## + 2/8 vertices, named, from 4e6fa6a:
## [1] C E
# Alternatively, you can use
neighbors(g5,"A")
## + 2/8 vertices, named, from 4e6fa6a:
## [1] B E
# Find the edges going out from D
E(g5)[.from("D")]
## + 1/6 edge from 4e6fa6a (vertex names):
## [1] D->C
# Are F and H tied?
E(g5)["F" %--% "H"]
## + 0/6 edges from 4e6fa6a (vertex names):
# Find the tie that goes from F to H
E(g5)["F" %->% "H"]
## + 0/6 edges from 4e6fa6a (vertex names):
# Find the edges directed to B
E(g5)[.to("B")]
## + 1/6 edge from 4e6fa6a (vertex names):
## [1] A->B
# Get node attributes
V(g5)$name
## [1] "A" "B" "C" "H" "E" "D" "F" "G"
# Add node attributes, for example, country name
V(g5)$country <- c("USA", "UKG", "AUS", "JPN", "FRN", "DEU", "NOR", "RUS")
# Add edge attributes. Let's add numerical values to denote the importance of an edge, such as trade volume. We have a total of 6 edges: length(E(g5))
E(g5)$type <- "Million USD" # Applied to all edges
E(g5)$value <- c(25, 10, 7, 8, 4, 1)
# You can also attach a graph-level attribute (here one named "type" with the value "value")
g5 <- set_graph_attr(g5, "type", "value")
# Assign value as edge weights
E(g5)$weight <- E(g5)$value
is_weighted(g5)
## [1] TRUE
# See node and edge attributes
vertex_attr(g5)
## $name
## [1] "A" "B" "C" "H" "E" "D" "F" "G"
##
## $country
## [1] "USA" "UKG" "AUS" "JPN" "FRN" "DEU" "NOR" "RUS"
edge_attr(g5)
## $type
## [1] "Million USD" "Million USD" "Million USD" "Million USD" "Million USD"
## [6] "Million USD"
##
## $value
## [1] 25 10 7 8 4 1
##
## $weight
## [1] 25 10 7 8 4 1
# You can also remove (delete) attributes, e.g. g5 <- delete_graph_attr(g5, "type") or g5 <- delete_edge_attr(g5, "value")
# Quickly check basic properties
g5
## IGRAPH 4e6fa6a DNW- 8 6 --
## + attr: type (g/c), name (v/c), country (v/c), type (e/c), value (e/n),
## | weight (e/n)
## + edges from 4e6fa6a (vertex names):
## [1] A->B B->C H->C A->E D->C B->E
# The letter codes summarize this graph object g5 (the hex string is an internal id): U/D means undirected/directed graph; N means it is a named graph (vertices have a name attribute)
# W means a weighted graph --> edges have a weight attribute
# B means a bipartite graph --> vertices have a type attribute
# A '-' marks a code that does not apply; here g5 is directed, named, and weighted, but not bipartite (DNW-)
# 8 is the number of nodes (vertices) in g5, and 6 is the number of edges
# (g/c) means graph/character, (v/c) vertex/character, (e/n) edge/numeric, (e/c) edge/character
## Network Measures
# list degree centrality
degree(g5, v = V(g5), # The degree of a node records the number of its adjacent edges.
mode = c("all", "out", "in", "total"),
loops = TRUE, normalized = FALSE)
## A B C H E D F G
## 2 3 3 1 2 1 0 0
# in-degree
degree(g5, mode="in") # How many edges are pointing to a particular node
## A B C H E D F G
## 0 1 3 0 2 0 0 0
# out-degree
degree(g5, mode="out") # How many edges are from a particular node
## A B C H E D F G
## 2 2 0 1 0 1 0 0
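# Because g5 carries edge weights, a weighted analogue of degree is available via strength(),
# which sums the weights of a node's adjacent edges (a quick sketch):
strength(g5, mode = "all") # uses E(g5)$weight by default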
# Eigenvector Centrality
# Eigenvector centrality is a measure of the influence of a node in a network. Scores are assigned based on the idea that connections to high-scoring nodes contribute more to a node's score than connections to low-scoring nodes. A high eigenvector score therefore means that a node is connected to many nodes that themselves have high scores.
evcent(g5)$vector
## Warning: `evcent()` was deprecated in igraph 2.0.0.
## ℹ Please use `eigen_centrality()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## A B C H E D
## 9.685752e-01 1.000000e+00 3.836254e-01 9.469184e-02 3.084933e-01 5.410962e-02
## F G
## 8.026701e-20 8.026701e-20
# Hubs and Authorities: similar measures to Eigenvector centrality
hub.score(g5)$vector
## Warning: `hub.score()` was deprecated in igraph 2.0.0.
## ℹ Please use `hits_scores()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## A B C H E D
## 1.0000000000 0.0138481147 0.0000000000 0.0015531986 0.0000000000 0.0008875421
## F G
## 0.0000000000 0.0000000000
authority.score(g5)$vector
## Warning: `authority.score()` was deprecated in igraph 2.0.0.
## ℹ Please use `hits_scores()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## A B C H E D
## 0.000000000 1.000000000 0.006116148 0.000000000 0.320553925 0.000000000
## F G
## 0.000000000 0.000000000
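# The deprecated calls above still work, but here is a sketch with the current API
# (eigen_centrality() and hits_scores(); the component names below are assumed, check ?hits_scores):
eigen_centrality(g5)$vector
hits <- hits_scores(g5)
hits$hub # hub scores
hits$authority # authority scores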
# Closeness
# Calculated as the reciprocal of the sum of the lengths of the shortest paths between a node and the other nodes it can reach. The more central a node is, the closer it is to all other nodes.
closeness(g5)
## A B C H E D F
## 0.01470588 0.09090909 NaN 0.14285714 NaN 0.25000000 NaN
## G
## NaN
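# A quick sanity check of the definition above (a sketch): rebuild A's closeness score
# from the weighted shortest-path distances to the nodes A can reach
d_A <- distances(g5, v = "A", mode = "out")
1 / sum(d_A[is.finite(d_A) & d_A > 0]) # should match closeness(g5)["A"]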
# Betweenness
# A measure of how often a node lies on the shortest paths between other nodes in the network. The higher the measure, the more important the node.
betweenness(g5)
## A B C H E D F G
## 0 1 0 0 0 0 0 0
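# Betweenness can also be rescaled to lie between 0 and 1 (a quick sketch):
betweenness(g5, normalized = TRUE)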
# Generate empty graph
empty_g <- make_empty_graph(20)
plot(empty_g, vertex.size = 5, vertex.label=NA)

# Generate full graph
full_g <- make_full_graph(20)
plot(full_g, vertex.size = 5, vertex.label = NA)

# Star graph
star_g <- make_star(20)
plot(star_g, vertex.size = 5, vertex.label = NA)

# Generate scale-free Barabási-Albert networks
bara_g <- sample_pa(200) # You can also use barabasi.game(200)
plot(degree_distribution(bara_g), xlab = "Degree", ylab = "Relative frequency") # Does the degree distribution follow a power law?
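# A log-log plot makes a power-law tail easier to judge (a sketch; degree 0 and
# zero-frequency degrees are dropped on the log scale):
dd <- degree_distribution(bara_g)
plot(seq_along(dd) - 1, dd, log = "xy", xlab = "Degree", ylab = "Relative frequency")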

# Erdos-Renyi random graph
erdo_g <- sample_gnm(n = 20, m = 20) # You can also use erdos.renyi.game. n is the number of nodes, m the number of edges
## Options include directed = and loops =
plot(erdo_g, vertex.size = 5, vertex.label = NA)

# Generate a graph using graph_from_adjacency_matrix()
set.seed(123)
# Generate a sparse 8 x 8 adjacency matrix from 64 Bernoulli draws with success probability 0.1
adj_m <- matrix(sample(0:1, 64, replace = TRUE, prob = c(0.9, 0.1)),
nc = 8) # Number of columns
adj_m
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 0 0 0 0 0 0 0 0
## [2,] 0 0 0 0 0 0 0 0
## [3,] 0 1 0 0 0 0 0 0
## [4,] 0 0 1 0 0 0 0 0
## [5,] 1 0 0 0 0 0 0 0
## [6,] 0 0 0 0 0 0 0 0
## [7,] 0 0 0 1 0 0 0 0
## [8,] 0 0 1 1 0 0 0 0
g_adj <- graph_from_adjacency_matrix(adj_m,
diag = FALSE, # If this is FALSE then the diagonal is zeroed out first
mode = "max") # Default is directed, the latest version requires one to set mode = "max" in lieu of "undirected"
plot(g_adj)

## Generate a graph from imported data (usually stored in edge-list format) using graph_from_edgelist()
# install.packages("remotes")
# remotes::install_github("RandiLGarcia/dyadr")
library(dyadr) # We'll be using the dyadic_trade data from Randi Garcia's dyadr package https://rdrr.io/github/RandiLGarcia/dyadr/man/dyadic_trade.html
data(dyadic_trade)
# Take a look at the data
head(dyadic_trade)
## ccode1 ccode2 year importer1 importer2 flow1 flow2 source1
## 1 2 20 1920 United States of America Canada 611.86 735.48 1
## 2 2 20 1921 United States of America Canada 335.44 442.99 1
## 3 2 20 1922 United States of America Canada 364.02 502.84 1
## 4 2 20 1923 United States of America Canada 416.00 598.14 1
## 5 2 20 1924 United States of America Canada 399.14 496.32 1
## 6 2 20 1925 United States of America Canada 473.72 608.35 1
## source2
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
# Subset (retrieve data from year 2009)
trade_df <- dyadic_trade[dyadic_trade$year == 2009, ]
# Data wrangling
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:igraph':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.4.3
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:igraph':
##
## crossing
d_trade = trade_df %>% dplyr::select(importer1, importer2, flow1) %>%
group_by(importer1, importer2) %>% # Organize the data into dyad format with 3 columns
summarise(USD_m = sum(flow1))
## `summarise()` has grouped output by 'importer1'. You can override using the
## `.groups` argument.
# Convert to a data frame and drop rows with blank country names or trade values
d_trade <- as.data.frame(d_trade)
d_trade <- d_trade[d_trade$importer1 != "", ]
d_trade <- d_trade[d_trade$importer2 != "", ]
d_trade <- d_trade[d_trade$USD_m != "", ]
head(d_trade)
## importer1 importer2 USD_m
## 2 Afghanistan Australia 25.27
## 3 Afghanistan Bangladesh 4.56
## 4 Afghanistan Brunei 0.03
## 5 Afghanistan Cambodia 1.98
## 6 Afghanistan China 239.26
## 7 Afghanistan Fiji 0.00
# Randomly sample 100 obs
set.seed(123)
d_trade <- sample_n(d_trade, 100)
# Convert to matrix
trade_m <- as.matrix(d_trade[, 1:2])
# Rescale trade volume (million USD), it will be used to represent edge width
d_trade$USD_m <- d_trade$USD_m / 80
trade_g <- graph_from_edgelist(trade_m, directed = FALSE)
plot(trade_g,
vertex.size = 6,
edge.width = d_trade[,3], # Set edge.width proportional to bilateral trade volume
vertex.color = "coral")

## Alternatively, you can generate a network via graph_from_data_frame(), which loads nodes (vertices) and edges separately. Here's my example: International Armament Collaboration (IAC) data
# Load required packages
library(foreign)
# Load the data
vertices <- read.csv(url("https://www.dropbox.com/s/j1o1wh5w2ucvkje/vertices.csv?dl=1"), header = TRUE)
edges <- read.csv(url("https://www.dropbox.com/s/p78ndqv4olktifu/IAC_edges.csv?dl=1"), header = TRUE)
# Recode and generate color labels
is.na(vertices$alliance) <- vertices$alliance == ""
vertices$c_color <- c("white","orange","red","olivedrab1")[vertices$alliance] # Partners' alliance status
vertices$m_color <- ifelse(vertices$mutual_defense > 0,'black','grey80') # If a partner holds mutual defense treaty with the U.S.
color_alliance <- vertices %>%
group_by(alliance, c_color) %>%
summarise(n = n()) %>%
filter(!is.na(c_color))
## `summarise()` has grouped output by 'alliance'. You can override using the
## `.groups` argument.
color_defense <- vertices %>%
group_by(mutual_defense, m_color) %>%
summarise(n = n()) %>%
filter(!is.na(m_color))
## `summarise()` has grouped output by 'mutual_defense'. You can override using
## the `.groups` argument.
# Convert into graph format
g <- graph_from_data_frame(d = edges, vertices = vertices, directed = FALSE)
# Mapping
V(g)$color[V(g)$alliance == "NATO"] = "red"
V(g)$color[V(g)$alliance == "MNNA"] = "orange"
V(g)$color[V(g)$alliance == "USA"] = "olivedrab1"
w1 <- E(g)$IAC # Number of IAC cases with the U.S.
w1 <- log(w1) # Log transform
col = ifelse(V(g)$mutual_defense > 0,'black','grey80')
line_type <- ifelse(!is.na(E(g)$mutual_defense), 1, 2)
m1 <- layout_nicely(g)
V(g)$vertex_degree <- igraph::degree(g)
# Plotting
par(mar=c(1,1,1,1))
plot(g,
vertex.size = 7,
vertex.label.color = "black",
vertex.frame.color = col,
vertex.label.cex = 0.4,
edge.color = 'grey30',
edge.width = w1+1,
edge.lty = line_type,
edge.arrow.size = 0.6,
layout = m1)
legend("topleft", legend = c("Alliance type", as.character(color_alliance$alliance), "Mutual defense", as.character(color_defense$mutual_defense)), pch = c(1, 19, 19, 19, 19, 1), col = c(NA, color_alliance$c_color, NA, color_defense$m_color), pt.cex = 1, cex = 0.7, bty = "n", ncol = 1, title = "")
legend("topleft", legend = "", cex = 0.9, bty = "n", ncol = 1,
title = "Nature of Security Ties")

Social Network Analysis using Stochastic Actor-Oriented Models (SAOM)
# Load required packages
library(RSiena)
library(statnet)
library(igraph)
library(dplyr)
# Load the FTA matrix data, which records global FTA ties between 1998 and 2019
FTA <- readRDS(url("https://www.dropbox.com/scl/fi/6luyfnrur7ictisun8zxh/FTA19982019.rds?rlkey=h6af5x79e318vm6q7rj8b8loc&st=bpi38ulh&dl=1"))
# Read in the country name data: a list of country names, abbreviations, and 3-digit ISO codes that we need to index countries in our data
cid <- read.csv(url("https://www.dropbox.com/scl/fi/7isucibr6woxr5gnlthu8/countryid.csv?rlkey=cezf0zg1vbkcvxrrvazzu4pjf&st=uxw2yxdw&dl=1"))
# Convert to matrix format
for(i in 1:length(FTA)){
FTA[[i]] <- as.matrix(FTA[[i]])
# Set country names on the rows and columns of each binary adjacency matrix
colnames(FTA[[i]]) <- cid$comtrade_id
rownames(FTA[[i]]) <- cid$comtrade_id
}
length(FTA)
## [1] 22
## Stack the FTA outcome networks into a multidimensional array. We need the FTA matrices for the last five years, 2015 to 2019 (elements 18 to 22 of the 22-year list)
FTA2019 <- FTA[[22]]
FTA2018 <- FTA[[21]]
FTA2017 <- FTA[[20]]
FTA2016 <- FTA[[19]]
FTA2015 <- FTA[[18]]
# Quick question: is global free trade on a secular rise? Let the Jaccard index be our guide. The Jaccard index gauges the similarity between two binary matrices and is defined as N11 / (N01 + N10 + N11). Using FTA_change1819 (list positions 18 and 19, i.e. 2015 and 2016) for illustration, N11 is the number of FTAs present in both periods (4307), N01 is the number absent in the first period but present in the second (68), and N10 is the number present in the first period but absent in the second (0).
FTA_change1819 <- table(FTA[[18]], FTA[[19]])
FTA_change1819 # Similarity between the 2015 and 2016 global FTA networks (list positions 18 and 19)
##
## 0 1
## 0 21546 68
## 1 0 4307
4307/(4307+68)
## [1] 0.9844571
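# The same quantity can be computed directly from the two binary matrices;
# a small helper sketching the formula above:
jaccard_index <- function(m1, m2) {
  n11 <- sum(m1 == 1 & m2 == 1) # FTAs present in both periods
  n01 <- sum(m1 == 0 & m2 == 1) # present only in the second period
  n10 <- sum(m1 == 1 & m2 == 0) # present only in the first period
  n11 / (n01 + n10 + n11)
}
jaccard_index(FTA[[18]], FTA[[19]]) # should reproduce the value just computed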
FTA_change1920 <- table(FTA[[19]], FTA[[20]])
FTA_change1920 # Similarity between the 2016 and 2017 global FTA networks (list positions 19 and 20)
##
## 0 1
## 0 21008 538
## 1 0 4375
4375/(4375+538)
## [1] 0.8904946
FTA_change2021 <- table(FTA[[20]], FTA[[21]])
FTA_change2021 # Similarity between the 2017 and 2018 global FTA networks (list positions 20 and 21)
##
## 0 1
## 0 20994 14
## 1 2 4911
4911/(4911+14+2)
## [1] 0.9967526
FTA_change2122 <- table(FTA[[21]], FTA[[22]])
FTA_change2122 # Similarity between the 2018 and 2019 global FTA networks (list positions 21 and 22)
##
## 0 1
## 0 20866 130
## 1 12 4913
4913/(4913+130+12)
## [1] 0.971909
# Well, it seems that the evolution of global free trade is relatively stable.
# Check row and column dimension: is it a square matrix?
nrow(FTA[[2]]) == ncol(FTA[[22]]) # Yes!
## [1] TRUE
net_size = nrow(FTA[[2]])
## Create an FTA array: five years of 161 by 161 matrices
FTA_array <- array(c(FTA2019 = FTA[[22]], FTA2018 = FTA[[21]], FTA2017 = FTA[[20]], FTA2016 = FTA[[19]], FTA2015 = FTA[[18]]),
dim = c(net_size, net_size, 5))
dim(FTA_array)
## [1] 161 161 5
## Create SIENA dependent variable network: one-mode network (all edge values are the same, either 0 or 1)
FTA_networks <- sienaDependent(netarray = FTA_array,
type = "oneMode",
allowOnly = FALSE) # Allow the dependent variable to both increase AND decrease
## Load country attributes data. It contains a set of country-level covariates that will be used as regressors.
country_attr <- readRDS(url("https://www.dropbox.com/scl/fi/ddhf2z5tl9bo8l9jo29vb/country_attr.rds?rlkey=5upfqxcljtuphtuvingnptptm&st=9bc9dddy&dl=1"))
# Take a look at the data, just your average-looking data.frame object
names(country_attr)
## [1] "vertex.names" "cname" "year" "population"
## [5] "industrial_va" "service_va" "trade_open" "polity"
## [9] "execrlc" "liec" "bound_rate" "applied_rate"
## [13] "binding" "kaopen" "ipr" "binding_over"
## [17] "lGDP"
head(country_attr)
## vertex.names cname year population industrial_va service_va trade_open polity
## 1 4 AFG 2015 17.34187 22.12404 53.23529 55.3462 10
## 2 8 ALB 2015 14.88022 21.76366 46.28435 71.8010 20
## 3 24 AGO 2015 17.14772 41.93309 48.65989 62.8885 9
## 4 28 ATG 2015 11.44642 16.45798 69.88088 93.6295 21
## 5 32 ARG 2015 17.57671 23.15306 55.81490 22.4862 20
## 6 51 ARM 2015 14.88022 25.70870 48.21267 71.5933 16
## execrlc liec bound_rate applied_rate binding kaopen ipr binding_over
## 1 0 7 0.00 5.47 0.0 -2 0.0001958417 -5.47
## 2 -1 7 6.79 1.11 100.0 0 0.0040150162 5.68
## 3 -1 6 59.08 9.38 100.0 -2 0.0078045620 49.70
## 4 -1 6 58.75 12.03 97.5 1 0.0036434520 46.72
## 5 -1 7 31.74 7.35 100.0 -2 0.0265864835 24.39
## 6 0 7 8.68 2.93 100.0 2 0.0000000000 5.75
## lGDP
## 1 0.00000000
## 2 0.00000000
## 3 0.00000000
## 4 0.00000000
## 5 0.04520129
## 6 0.04498265
## Create time-varying variables: we need to reshape the original data.frame to wide format and supply it in matrix form
# If you want to use a constant (time-invariant or fixed) variable, then use coCovar()
## Notations ##
#cc = coCovar
#vc = varCovar
#cdc = coDyadCovar
#vdc = varDyadCovar
library(reshape)
# Population
pop <- reshape(country_attr[, c(1, 4, 3)], idvar = "vertex.names", timevar = "year", direction = "wide")
pop_vc <- varCovar(as.matrix(pop[,-1])) # Make it into time-varying covariate, -1 means remove country id column
# Trade openness
trade_openness <- reshape(country_attr[, c(1, 7, 3)], idvar = "vertex.names", timevar = "year", direction = "wide")
to_vc <- varCovar(as.matrix(trade_openness[,-1])) # Also make it into time-varying covariate
## Create "dyadic" time-varying variables
# First, we need dyadic trade volume data between 2015 and 2019, recorded as log of million USD
trade2015 <- readRDS(url("https://www.dropbox.com/scl/fi/17i8e42xzhgo9dqdmzohn/trade2015.rds?rlkey=5ldvbtn4550vwrbobsu5lfuhx&st=ipp306c7&dl=1"))
trade2016 <- readRDS(url("https://www.dropbox.com/scl/fi/bevq047cg7q6p96qk8rk3/trade2016.rds?rlkey=x19pybshmqf45aqclc8audnqj&st=qn68qrog&dl=1"))
trade2017 <- readRDS(url("https://www.dropbox.com/scl/fi/1a2wvu3o28ur5ce462blr/trade2017.rds?rlkey=izxqr82e8iirr9h73b48i3yfx&st=gavsra6g&dl=1"))
trade2018 <- readRDS(url("https://www.dropbox.com/scl/fi/tvqyzcgqvkfo7kxpkn9j8/trade2018.rds?rlkey=sibcdornmgwzznb5myjfaeqju&st=bnssd0n7&dl=1"))
trade2019 <- readRDS(url("https://www.dropbox.com/scl/fi/gxustrrcx6kvw9137uttd/trade2019.rds?rlkey=jjvgc0ju17465bv6e9jjnebu3&st=cpgy2cry&dl=1"))
# Take a step back and select global FTA network in 2015 for visualization
g2015 <- graph_from_adjacency_matrix(FTA[[18]]) # Plot the FTA network in year 2015
g2015_edgelist <- as_edgelist(g2015) # Convert the adjacency-matrix representation to edgelist format
# Subset country attributes in year 2015
year2015_attributes <- country_attr[country_attr$year == 2015, ]
# Fill in dyadic trade volume data using the trade matrix loaded earlier
trade2015_df <- as.data.frame(as.table(trade2015))
trade2015_df <- trade2015_df[, c(2, 1, 3)] # Reorganize columns
names(trade2015_df) <- c("exporter", "importer", "trade_vol")
# Create an igraph object for plotting
year2015_net <- graph_from_data_frame(d = g2015_edgelist, directed = TRUE,
vertices = year2015_attributes)
# Setting layout of the graph
layout <- layout.fruchterman.reingold(year2015_net)
## Warning: `layout.fruchterman.reingold()` was deprecated in igraph 2.1.0.
## ℹ Please use `layout_with_fr()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Plotting
plot(year2015_net,
vertex.label = country_attr[,2], # Label with country abbreviation
vertex.size = 10,
vertex.label.cex = 0.5, # Node label (country abbreviation) text size
layout = layout,
edge.arrow.size = .2,
edge.arrow.width = trade2015_df[,3], # Arrow width scaled by dyadic trade volume
edge.color="light gray",
vertex.frame.color = NA)

# Back to RSiena pack and continue to create dyadic variables
# varDyadCovar() means time-varying dyadic variables
# For a data array of n by n by m periods, only m-1 periods of data will be used. The last array will be discarded (you cannot use data from the m-th period to predict the network formed in the same period)
trade_Dyadvar <- varDyadCovar(array(c(trade2015, trade2016, trade2017, trade2018), dim = c(161, 161, 4)))
FTA_covar <- sienaDataCreate(FTA_networks, pop_vc, to_vc, trade_Dyadvar) # The first argument is your network, the rest are your node and edge variables
# After building the data object, we first create the effects object with getEffects(); it holds the basic effects, to which the covariate effects to be estimated are then added.
siena_effects <- getEffects(FTA_covar)
# We can use includeEffects() to add additional terms to the base model. For example, do countries tend to extend FTA ties to countries that are already tied to their immediate neighbors? This term is called "transTrip" (transitive triplets) in RSiena, and we add it to the model via includeEffects().
siena_effects <- includeEffects(siena_effects, # siena object of effects
transTrip, # Name of the term to be included
name = "FTA_networks") # Name of the dependent variable for which effects are being included
## effectName include fix test initialValue parm
## 1 transitive triplets TRUE FALSE FALSE 0 0
# Include the alter effect (altX) of population: do countries tend to extend FTAs to more populous partners (alters)?
siena_effects <- includeEffects(siena_effects, altX,
interaction1 = "pop_vc")
## effectName include fix test initialValue parm
## 1 pop_vc alter TRUE FALSE FALSE 0 0
# Also, do countries (egos) with a higher level of trade openness tend to extend FTAs to other countries (alters)?
siena_effects <- includeEffects(siena_effects, egoX,
interaction1 = "to_vc")
## effectName include fix test initialValue parm
## 1 to_vc ego TRUE FALSE FALSE 0 0
# The includeEffects() function is also how covariate effects are specified from an actor-oriented perspective, that is, from the ego toward alters or the other way around.
siena_effects <- includeEffects(siena_effects, X, # X for dyadic effect, check effectsDocumentation()
interaction1 = "trade_Dyadvar")
## effectName include fix test initialValue parm
## 1 trade_Dyadvar TRUE FALSE FALSE 0 0
# Model setting
# Before we estimate our model, we need to create an object of input specifications using the sienaAlgorithmCreate() function. For pedagogical purposes, I'll keep most settings at their default values. We'll also set seed = 123 for replication purposes.
input_options <- sienaAlgorithmCreate(projname = "FTA_model", # Model name (you can name it whatever you like)
# maxlike = TRUE, # Whether to use maximum likelihood method or Method of Moments estimation
# nsub = 5, # Number of iterations in phase 2
# n3 = 200, # Number of iterations in phase 3
seed = 123)
## If you use this algorithm object, siena07 will create/use an output file FTA_model.txt .
# We can now estimate our model using siena07() function
FTA.mod <- siena07(x = input_options,
data = FTA_covar, # Variable list
effects = siena_effects, # Effects to be estimated (from whose perspective)
batch= TRUE,
verbose = FALSE,
returnDeps = TRUE, # Whether to return a list of simulated dependent variables (set to TRUE if you will be running sienaGOF() later)
useCluster = TRUE, # Use multiple clusters
nbrNodes = 3) # Number of processes to use if useCluster = TRUE
##
## Start phase 0
## theta: -0.11 0.00 0.00 0.00 0.00 0.00
##
## Start phase 1
## Phase 1 Iteration 1 Progress: 0%
## Phase 1 Iteration 4 Progress: 0%
## Phase 1 Iteration 10 Progress: 0%
## Phase 1 Iteration 25 Progress: 0%
## Phase 1 Iteration 40 Progress: 1%
## theta: -0.88389 -0.07301 0.01653 0.00913 0.02597 -0.00187
##
## Start phase 2.1
## Phase 2 Subphase 1 Iteration 1 Progress: 14%
## Phase 2 Subphase 1 Iteration 2 Progress: 14%
## theta -1.08375 -0.09578 0.01783 0.01205 0.02774 -0.00202
## ac 1.191 1.195 1.224 1.682 0.934 0.977
## Phase 2 Subphase 1 Iteration 3 Progress: 14%
## Phase 2 Subphase 1 Iteration 4 Progress: 14%
## theta -1.72026 -0.17148 0.02239 0.02074 0.03027 -0.00244
## ac 1.253 1.262 1.295 0.948 1.002 1.102
## Phase 2 Subphase 1 Iteration 5 Progress: 14%
## Phase 2 Subphase 1 Iteration 6 Progress: 14%
## theta -2.41235 -0.25480 0.02832 0.02848 0.02129 -0.00312
## ac 1.263 1.273 1.302 0.994 1.074 0.951
## Phase 2 Subphase 1 Iteration 7 Progress: 15%
## Phase 2 Subphase 1 Iteration 8 Progress: 15%
## theta -2.85688 -0.28992 0.03095 0.03132 0.00743 -0.00384
## ac 1.264 1.273 1.301 1.224 1.097 0.915
## Phase 2 Subphase 1 Iteration 9 Progress: 15%
## Phase 2 Subphase 1 Iteration 10 Progress: 15%
## theta -3.16447 -0.28105 0.03221 0.03270 -0.00153 -0.00538
## ac 1.264 1.273 1.301 0.872 1.097 0.740
## Phase 2 Subphase 1 Iteration 200 Progress: 23%
## theta -8.52867 -0.25979 0.03401 0.03357 -0.00562 -0.11360
## ac 1.25589 1.21618 1.25405 0.00663 0.93627 0.71015
## theta -6.07499 -0.24566 0.03204 0.03265 -0.00384 -0.06836
## ac 1.2558 1.2142 1.2498 -0.0174 0.9169 0.7007
## theta: -6.07499 -0.24566 0.03204 0.03265 -0.00384 -0.06836
##
## Start phase 2.2
## Phase 2 Subphase 2 Iteration 1 Progress: 23%
## Phase 2 Subphase 2 Iteration 2 Progress: 23%
## Phase 2 Subphase 2 Iteration 3 Progress: 23%
## Phase 2 Subphase 2 Iteration 4 Progress: 23%
## Phase 2 Subphase 2 Iteration 5 Progress: 23%
## Phase 2 Subphase 2 Iteration 6 Progress: 23%
## Phase 2 Subphase 2 Iteration 7 Progress: 23%
## Phase 2 Subphase 2 Iteration 8 Progress: 23%
## Phase 2 Subphase 2 Iteration 9 Progress: 23%
## Phase 2 Subphase 2 Iteration 10 Progress: 23%
## Phase 2 Subphase 2 Iteration 200 Progress: 31%
## theta -8.01813 -0.25968 0.03161 0.03531 -0.00399 -0.10484
## ac 0.8557 0.5867 0.6490 -0.1304 0.0155 0.5001
## theta -7.14941 -0.25226 0.03211 0.03499 -0.00477 -0.08945
## ac 0.853 0.603 0.629 -0.108 0.030 0.502
## theta: -7.14941 -0.25226 0.03211 0.03499 -0.00477 -0.08945
##
## Start phase 2.3
## Phase 2 Subphase 3 Iteration 1 Progress: 32%
## Phase 2 Subphase 3 Iteration 2 Progress: 32%
## Phase 2 Subphase 3 Iteration 3 Progress: 32%
## Phase 2 Subphase 3 Iteration 4 Progress: 32%
## Phase 2 Subphase 3 Iteration 5 Progress: 32%
## Phase 2 Subphase 3 Iteration 6 Progress: 32%
## Phase 2 Subphase 3 Iteration 7 Progress: 33%
## Phase 2 Subphase 3 Iteration 8 Progress: 33%
## Phase 2 Subphase 3 Iteration 9 Progress: 33%
## Phase 2 Subphase 3 Iteration 10 Progress: 33%
## Phase 2 Subphase 3 Iteration 200 Progress: 41%
## theta -8.14842 -0.24740 0.03297 0.03598 -0.00547 -0.10653
## ac 0.9588 0.6807 0.7299 0.1104 0.0436 0.3567
## theta -7.83950 -0.25418 0.03291 0.03512 -0.00458 -0.10128
## ac 0.9460 0.6786 0.6680 0.1003 0.0592 0.4088
## theta: -7.83950 -0.25418 0.03291 0.03512 -0.00458 -0.10128
##
## Start phase 2.4
## Phase 2 Subphase 4 Iteration 1 Progress: 43%
## Phase 2 Subphase 4 Iteration 2 Progress: 44%
## Phase 2 Subphase 4 Iteration 3 Progress: 44%
## Phase 2 Subphase 4 Iteration 4 Progress: 44%
## Phase 2 Subphase 4 Iteration 5 Progress: 44%
## Phase 2 Subphase 4 Iteration 6 Progress: 44%
## Phase 2 Subphase 4 Iteration 7 Progress: 44%
## Phase 2 Subphase 4 Iteration 8 Progress: 44%
## Phase 2 Subphase 4 Iteration 9 Progress: 44%
## Phase 2 Subphase 4 Iteration 10 Progress: 44%
## Phase 2 Subphase 4 Iteration 200 Progress: 52%
## theta -8.31718 -0.25217 0.03233 0.03451 -0.00405 -0.10981
## ac 1.0194 0.7391 0.7676 0.0564 0.1617 0.4991
## theta -8.31899 -0.25168 0.03263 0.03533 -0.00451 -0.10963
## ac 0.9752 0.6915 0.7124 0.0302 0.1586 0.5069
## theta: -8.31899 -0.25168 0.03263 0.03533 -0.00451 -0.10963
##
## Start phase 3
## Phase 3 Iteration 1 Progress 59%
## Phase 3 Iteration 301 Progress 72%
## Phase 3 Iteration 601 Progress 84%
## Phase 3 Iteration 901 Progress 96%
# Check out the results. The estimated FTA.mod object includes a column of estimated coefficients and simulation-based standard errors
summary(FTA.mod)
## Estimates, standard errors and convergence t-ratios
##
## Estimate Standard Convergence
## Error t-ratio
##
## Rate parameters:
## 0.1 Rate parameter period 1 0.8846 ( 0.0751 )
## 0.2 Rate parameter period 2 0.0995 ( 0.0258 )
## 0.3 Rate parameter period 3 3.3593 ( 0.1444 )
## 0.4 Rate parameter period 4 0.4222 ( 0.0526 )
##
## Other parameters:
## 1. eval outdegree (density) -8.3190 ( 37.5958 ) 1.4478
## 2. eval reciprocity -0.2517 ( 0.1700 ) 0.3059
## 3. eval transitive triplets 0.0326 ( 0.0085 ) 0.6765
## 4. eval trade_Dyadvar 0.0353 ( 0.0419 ) -0.0119
## 5. eval pop_vc alter -0.0045 ( 0.0283 ) -0.0087
## 6. eval to_vc ego -0.1096 ( 0.6371 ) 0.4898
##
## Overall maximum convergence ratio: 1.6278
##
##
## Total of 2159 iteration steps.
##
## Covariance matrix of estimates (correlations below diagonal)
##
## 1413.446 4.833 0.005 -1.528 0.728 23.952
## 0.756 0.029 0.000 -0.005 0.002 0.082
## 0.017 -0.130 0.000 0.000 0.000 0.000
## -0.969 -0.728 -0.048 0.002 -0.001 -0.026
## 0.684 0.515 -0.006 -0.684 0.001 0.012
## 1.000 0.756 0.019 -0.969 0.684 0.406
##
## Derivative matrix of expected statistics X by parameters:
##
## 43.031 8.673 475.559 -97.790 1.863 -2564.529
## 4.737 170.057 668.043 62.551 8.053 69.487
## 354.378 937.051 66191.595 4420.123 -188.983 -20171.928
## -107.219 79.664 5960.394 20779.714 828.706 -438.250
## -1.226 9.990 19.606 1016.502 2454.588 -2646.250
## -2544.360 -539.067 -27643.006 7055.313 -136.653 153571.571
##
## Covariance matrix of X (correlations below diagonal):
##
## 75.656 15.212 1091.634 -118.628 12.191 -1895.675
## 0.093 351.888 3178.457 396.599 6.382 2123.378
## 0.234 0.316 288291.922 22827.959 -1991.753 120409.456
## -0.065 0.101 0.202 44097.661 1403.566 -3050.761
## 0.028 0.007 -0.073 0.132 2565.054 -6922.330
## -0.146 0.076 0.150 -0.010 -0.092 2227533.506
# It looks like the transitive triplets effect significantly increases the probability of an i-j FTA, while the three dyadic and nodal covariates are not significant; some of their estimates even point in the opposite direction from what we might expect.
# Estimated coefficients are on the log-odds scale, so exponentiating them gives odds ratios. For example, the coefficient for trade_Dyadvar is 0.0353.
# This suggests that, other things being equal, the odds of i extending an FTA to j rather than to k are multiplied by exp(0.0353) for each additional unit of dyadic trade in the i-j dyad relative to the i-k dyad.
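# A quick back-of-the-envelope check of that interpretation:
exp(0.0353) # ~ 1.036, i.e. roughly a 3.6% increase in the odds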
# You can also see how the rate parameters vary across the four periods. The rate parameter is lowest in period 2, meaning the FTA network changes least during that period, while the rate of change picks up again in period 3.
# We're now ready to plot the goodness-of-fit (gof) statistics, which compare the observed and simulated networks. Gof statistics are calculated for each period.
# Low p-values would indicate a potential problem. Our result looks good.
gof1 <- sienaGOF(sienaFitObject = FTA.mod,
auxiliaryFunction = IndegreeDistribution, # Returns a named vector, the distribution of the observed or simulated in-degrees for the values in levels.
verbose = TRUE, # Whether to print immediate results
join = TRUE, # Boolean: should sienaGOF test all periods individually (FALSE), or sum across periods (TRUE)
varName = "FTA_networks")
## Detected 1002 iterations and 1 group.
## Calculating auxiliary statistics for periods 1 2 3 4 .
## Period 1
## > Completed 100 calculations > Completed 200 calculations > Completed 300 calculations > Completed 400 calculations > Completed 500 calculations > Completed 600 calculations > Completed 700 calculations > Completed 800 calculations > Completed 900 calculations > Completed 1000 calculations > Completed 1002 calculations
## Period 2
## > Completed 100 calculations > Completed 200 calculations > Completed 300 calculations > Completed 400 calculations > Completed 500 calculations > Completed 600 calculations > Completed 700 calculations > Completed 800 calculations > Completed 900 calculations > Completed 1000 calculations > Completed 1002 calculations
## Period 3
## > Completed 100 calculations > Completed 200 calculations > Completed 300 calculations > Completed 400 calculations > Completed 500 calculations > Completed 600 calculations > Completed 700 calculations > Completed 800 calculations > Completed 900 calculations > Completed 1000 calculations > Completed 1002 calculations
## Period 4
## > Completed 100 calculations > Completed 200 calculations > Completed 300 calculations > Completed 400 calculations > Completed 500 calculations > Completed 600 calculations > Completed 700 calculations > Completed 800 calculations > Completed 900 calculations > Completed 1000 calculations > Completed 1002 calculations
plot(gof1)

## Calculate prediction accuracy
# RSiena primarily focuses on estimating the parameters of network evolution models rather than on direct prediction-accuracy measures such as classification accuracy or AUC. However, we can still assess prediction accuracy using the goodness-of-fit results we just computed, or by comparing observed and simulated networks.
# Extract first simulated network in period 2, which is stored in $sims
# The structure of sims is a nested list: sims[[run]][[group]][[dependent variable]][[period]]. If x$maxlike=TRUE and there is only one group (like our data here) and one period, then the structure is [[run]][[dependent variable]].
sim_net <- FTA.mod$sims[[1]][1]$Data1$FTA_networks$`2` # This means first simulated network from data 1 (we only have one here) in period 2. The output is a 3-column edgelist (i, j, tie) = [node i, node j, i-j tie]
# Examine the first 10 observations
head(sim_net, 10)
## [,1] [,2] [,3]
## [1,] 1 10 1
## [2,] 1 66 1
## [3,] 1 89 1
## [4,] 1 102 1
## [5,] 1 110 1
## [6,] 2 8 1
## [7,] 2 12 1
## [8,] 2 36 1
## [9,] 2 37 1
## [10,] 2 39 1
# We now fill in edgelist values into a 161 by 161 matrix
simulated_network <- matrix(0, nrow = 161, ncol = 161)
for(i in 1:nrow(sim_net)){
simulated_network[sim_net[i,1], sim_net[i,2]] = sim_net[i,3]
}
# Extract actual FTA network observed in period 2
observed_network <- FTA_covar$depvars[[1]][, , 2]
# Compute accuracy
correct_predictions <- sum(simulated_network == observed_network) / length(observed_network)
cat("New FTAs prediction accuracy:", correct_predictions)
## New FTAs prediction accuracy: 0.9993442
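# Note that this cell-wise accuracy is inflated by the many zero (no-FTA) dyads;
# a confusion table gives a fuller picture (a quick sketch):
table(observed = as.vector(observed_network), simulated = as.vector(simulated_network))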
Re-analyze FTA formation using static and temporal Exponential Random Graph Models (ERGM)
# Load required packages
library(ergm)
library(btergm) # For binary temporal ERGM
library(speedglm) # For the use of glm inside btergm()
library(statnet)
library(network)
# Load FTA data again
FTA <- readRDS(url("https://www.dropbox.com/scl/fi/6luyfnrur7ictisun8zxh/FTA19982019.rds?rlkey=h6af5x79e318vm6q7rj8b8loc&st=bpi38ulh&dl=1"))
# Load cid again for country indexing
cid <- read.csv(url("https://www.dropbox.com/scl/fi/7isucibr6woxr5gnlthu8/countryid.csv?rlkey=cezf0zg1vbkcvxrrvazzu4pjf&st=uxw2yxdw&dl=1"))
# Pretty much the same as before, but this time we need to tweak the data a bit by converting each matrix to the network class via network()
for(i in 1:length(FTA)){
colnames(FTA[[i]]) <- cid$comtrade_id
rownames(FTA[[i]]) <- cid$comtrade_id
FTA[[i]] <- network(as.matrix(FTA[[i]]),
matrix.type = 'adjacency', # The input is an adjacency matrix
ignore.eval = TRUE,
directed = TRUE) # Directed network
}
FTA <- FTA[18:22] # Subset the last five years (2015-2019)
# Load dyadic trade data again
trade2015 <- readRDS(url("https://www.dropbox.com/scl/fi/17i8e42xzhgo9dqdmzohn/trade2015.rds?rlkey=5ldvbtn4550vwrbobsu5lfuhx&st=ipp306c7&dl=1"))
trade2016 <- readRDS(url("https://www.dropbox.com/scl/fi/bevq047cg7q6p96qk8rk3/trade2016.rds?rlkey=x19pybshmqf45aqclc8audnqj&st=qn68qrog&dl=1"))
trade2017 <- readRDS(url("https://www.dropbox.com/scl/fi/1a2wvu3o28ur5ce462blr/trade2017.rds?rlkey=izxqr82e8iirr9h73b48i3yfx&st=gavsra6g&dl=1"))
trade2018 <- readRDS(url("https://www.dropbox.com/scl/fi/tvqyzcgqvkfo7kxpkn9j8/trade2018.rds?rlkey=sibcdornmgwzznb5myjfaeqju&st=bnssd0n7&dl=1"))
trade2019 <- readRDS(url("https://www.dropbox.com/scl/fi/gxustrrcx6kvw9137uttd/trade2019.rds?rlkey=jjvgc0ju17465bv6e9jjnebu3&st=cpgy2cry&dl=1"))
## Combine annual dyadic trade into a list object
dyadic_trade <- list(trade2015, trade2016, trade2017, trade2018, trade2019)
## Load country attributes data (again)
country_attr <- readRDS(url("https://www.dropbox.com/scl/fi/ddhf2z5tl9bo8l9jo29vb/country_attr.rds?rlkey=5upfqxcljtuphtuvingnptptm&st=9bc9dddy&dl=1"))
# Set nodal attributes within the network objects using the set.vertex.attribute() function, so that these variables are attached to each network in the FTA list as nodal attributes
for(i in unique(country_attr$year)){
population <- country_attr[country_attr$year == i, c("population")]
trade_open <- country_attr[country_attr$year == i, c("trade_open")]
network::set.vertex.attribute(FTA[[(i-2014)]], value = population, attrname = "population") # Use i - 2014 in order to sort the index in FTA list
network::set.vertex.attribute(FTA[[(i-2014)]], value = trade_open, attrname = "trade_open")
}
set.seed(123)
# Baseline model: select a single year (2015) and estimate the tie probabilities for that year
FTA.ergm <- ergm(FTA[[1]] ~ edgecov(dyadic_trade[[1]]) + nodecov("population") + nodecov("trade_open")) # Here we use edgecov() and nodecov() to create edge and node covariates inside ergm() function
## Starting maximum pseudolikelihood estimation (MPLE):
## Obtaining the responsible dyads.
## Evaluating the predictor and response matrix.
## Maximizing the pseudolikelihood.
## Finished MPLE.
## Evaluating log-likelihood at the estimate.
summary(FTA.ergm)
## Call:
## ergm(formula = FTA[[1]] ~ edgecov(dyadic_trade[[1]]) + nodecov("population") +
## nodecov("trade_open"))
##
## Maximum Likelihood Results:
##
## Estimate Std. Error MCMC % z value Pr(>|z|)
## edgecov.dyadic_trade[[1]] 0.023837 0.002330 0 10.230 <1e-04 ***
## nodecov.population -0.067472 0.001540 0 -43.819 <1e-04 ***
## nodecov.trade_open 0.001270 0.000184 0 6.898 <1e-04 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Null Deviance: 35711 on 25760 degrees of freedom
## Residual Deviance: 23158 on 25757 degrees of freedom
##
## AIC: 23164 BIC: 23189 (Smaller is better. MC Std. Err. = 0)
# Wow, previously insignificant variables are now significant! ERGM does appear better at predicting ties.
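# ERGM coefficients are conditional log-odds of a tie, so exponentiating gives an odds ratio;
# a quick sketch for the dyadic trade covariate:
exp(coef(FTA.ergm)["edgecov.dyadic_trade[[1]]"]) # change in the odds of a tie per unit of dyadic trade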
# Adding network structural characteristics
# density: equivalent to "edges" or "istar(1)" or "ostar(1)" divided by n*(n-1).
# mutual: reciprocity term
# twopath: this term adds one statistic to the model, equal to the number of 2-paths in the network i->j->k
# Let's add the "twopath" term as an exploratory probe. Please do take a look at the verbose log of the Monte Carlo maximum likelihood (MCMLE) process
FTA_structural.ergm <- ergm(FTA[[1]] ~ edgecov(dyadic_trade[[1]]) + nodecov("population") + nodecov("trade_open") + twopath,
control = control.ergm(parallel = 4, # Use 4 CPU cores
parallel.type = "PSOCK",
MCMC.samplesize = 5000)) # Number of network statistics, randomly drawn from a given distribution on the set of all networks, returned by the Metropolis-Hastings algorithm.
## Starting maximum pseudolikelihood estimation (MPLE):
## Obtaining the responsible dyads.
## Evaluating the predictor and response matrix.
## Maximizing the pseudolikelihood.
## Finished MPLE.
## Starting Monte Carlo maximum likelihood estimation (MCMLE):
## Iteration 1 of at most 60:
## Warning: 'glpk' selected as the solver, but package 'Rglpk' is not available;
## falling back to 'lpSolveAPI'. This should be fine unless the sample size and/or
## the number of parameters is very big.
## 1 Optimizing with step length 1.0000.
## The log-likelihood improved by 3.3274.
## Estimating equations are not within tolerance region.
## Iteration 2 of at most 60:
## 1 Optimizing with step length 1.0000.
## The log-likelihood improved by 0.4189.
## Estimating equations are not within tolerance region.
## Iteration 3 of at most 60:
## 1 Optimizing with step length 1.0000.
## The log-likelihood improved by 0.0406.
## Convergence test p-value: 0.0721. Not converged with 99% confidence; increasing sample size.
## Iteration 4 of at most 60:
## 1 Optimizing with step length 1.0000.
## The log-likelihood improved by 0.0133.
## Convergence test p-value: 0.5063. Not converged with 99% confidence; increasing sample size.
## Iteration 5 of at most 60:
## 1 Optimizing with step length 1.0000.
## The log-likelihood improved by 0.0228.
## Convergence test p-value: 0.1542. Not converged with 99% confidence; increasing sample size.
## Iteration 6 of at most 60:
## 1 Optimizing with step length 1.0000.
## The log-likelihood improved by 0.0219.
## Convergence test p-value: 0.0091. Converged with 99% confidence.
## Finished MCMLE.
## Evaluating log-likelihood at the estimate. Fitting the dyad-independent submodel...
## Bridging between the dyad-independent submodel and the full model...
## Setting up bridge sampling...
## Using 16 bridges: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 .
## Bridging finished.
##
## This model was fit using MCMC. To examine model diagnostics and check
## for degeneracy, use the mcmc.diagnostics() function.
summary(FTA_structural.ergm)
## Call:
## ergm(formula = FTA[[1]] ~ edgecov(dyadic_trade[[1]]) + nodecov("population") +
## nodecov("trade_open") + twopath, control = control.ergm(parallel = 4,
## parallel.type = "PSOCK", MCMC.samplesize = 5000))
##
## Monte Carlo Maximum Likelihood Results:
##
## Estimate Std. Error MCMC % z value Pr(>|z|)
## edgecov.dyadic_trade[[1]] 1.813e-02 2.066e-03 0 8.777 <1e-04 ***
## nodecov.population -8.873e-02 1.760e-03 0 -50.427 <1e-04 ***
## nodecov.trade_open 1.896e-05 1.564e-04 0 0.121 0.903
## twopath 1.743e-02 9.250e-04 0 18.837 <1e-04 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Null Deviance: 35711 on 25760 degrees of freedom
## Residual Deviance: 22951 on 25756 degrees of freedom
##
## AIC: 22959 BIC: 22991 (Smaller is better. MC Std. Err. = 3.146)
# After adding the twopath term, trade_open loses its significance.
Longitudinal Analysis of FTA formation: BTERGM
# Fit a plain vanilla base btergm
FTA.btergm <- btergm(FTA ~ edgecov(dyadic_trade) + nodecov("population") + nodecov("trade_open"))
# Add a memory term; note that the memory-term options cannot be combined (you can only choose one)
# NOT RUN
# Dyadic stability (type = "stability"): checks whether both edges and non-edges are stable between previous and current network
FTA_stability.btergm <- btergm(FTA ~ edgecov(dyadic_trade) + nodecov("population") + nodecov("trade_open") + memory(type = "stability"))
# You may also consider positive autoregression (type = "autoregression"): checks whether previous ties are carried over to the current network
# Lagged by 1 period
FTA_auto.btergm <- btergm(FTA ~ edgecov(dyadic_trade) + nodecov("population") + nodecov("trade_open") + memory(type = "autoregression", lag = 1))
Results.
# btergm FTA estimation results
FTA.btergm <- readRDS(url("https://www.dropbox.com/scl/fi/x4c2c56o21ryxarlhfjbw/FTA.btergm.rds?rlkey=z7hu0v1z7u2qf21vne1t2r1ix&st=475rvyzi&dl=1"))
FTA_stability.btergm <- readRDS(url("https://www.dropbox.com/scl/fi/0yf039ydu9ohuxzq5ba7r/FTA_stability.btergm.rds?rlkey=gvp889g0r4v4igdcxsxkoe3sw&st=r7estp39&dl=1"))
# Results at a glance
summary(FTA.btergm)
## ==========================
## Summary of model fit
## ==========================
##
## Formula: FTA ~ edgecov(dyadic_trade) + nodecov("population") + nodecov("trade_open")
## Time steps: 5
## Bootstrapping sample size: 500
## Estimates and 95% confidence intervals:
## Estimate Boot mean 2.5% 97.5%
## edgecov.dyadic_trade[[i]] 5.6619e-02 0.05712394 0.0518 0.0635
## nodecov.population -6.9177e-02 -0.06946778 -0.0735 -0.0654
## nodecov.trade_open -6.2946e-05 -0.00007736 -0.0002 0.0000
summary(FTA_stability.btergm)
## ==========================
## Summary of model fit
## ==========================
##
## Formula: FTA ~ edgecov(dyadic_trade) + nodecov("population") + nodecov("trade_open") + memory(type = "stability")
## Time steps: 4
## Bootstrapping sample size: 500
## Estimates and 95% confidence intervals:
## Estimate Boot mean 2.5% 97.5%
## edgecov.dyadic_trade[[i]] 0.0309516 0.0243176 -0.0014 0.0372
## nodecov.population 0.0144705 0.0183102 0.0117 0.0291
## nodecov.trade_open 0.0018269 0.0015195 -0.0007 0.0029
## edgecov.memory[[i]] 5.8766994 6.0160961 5.3119 7.3824
# Only population and the dyadic stability (memory) term remain significant.
Assessing prediction accuracy of BTERGM
# NOT RUN
# Use simulated networks (via gof() function) from the estimation of [t, t+1] data and compare them to the actual network at t+2
# Model 1 (baseline model)
pred1_auc <- list()
for (i in 1:3){ # A total of 3 predictions will be made
train_t <- c(i, i+1) # Set time frame (use two years' data as training data)
dy_trade <- dyadic_trade[train_t] # Subset training data time frame
model1_fit <- btergm(FTA[train_t] ~
edgecov(dy_trade) + nodecov("population") + nodecov("trade_open"), # edge and nodal covariates
R = 1000, # 1000 bootstrap replications
parallel = "multicore", # Parallel and multi-core processing
ncpus = 4) # Number of CPU cores used
model1_test <- i+2 # Number of predictions: 3 predictions
g <- gof(model1_fit, statistics = c(rocpr), nsim = 100, target = FTA[model1_test]) # Retrieve ROC-Precision metric
pred1_auc[[i]] <- g$`Tie prediction` # Store tie prediction result in pred1_auc list
}
# Model 2 (baseline model w/ stability term)
pred2_auc <- list()
for (i in 1:3){
train_t <- c(i+1, i+2)
dy_trade <- dyadic_trade[train_t]
model2_fit <- btergm(FTA[train_t] ~
edgecov(dy_trade) + nodecov("population") + nodecov("trade_open") + memory(type = "stability"),
R = 1000, # 1000 bootstrap replications
parallel = "multicore", # Parallel and multi-core processing
ncpus = 4) # Number of CPU cores used
model2_test <- i+2
g <- gof(model2_fit, statistics = c(rocpr), nsim = 100, target = FTA[model2_test])
pred2_auc[[i]] <- g$`Tie prediction`
}
Create dataframe for plotting
# Load previously generated prediction result list
pred1_auc <- readRDS(url("https://www.dropbox.com/scl/fi/hzws9swlsytyybextjn9r/pred1_auc.rds?rlkey=c2hsmntk3w18o9n4ci3lcvcpk&st=1gmvilhu&dl=1"))
pred2_auc <- readRDS(url("https://www.dropbox.com/scl/fi/ru05hs3xymsqzlfdq8byv/pred2_auc.rds?rlkey=fbr3pmjgzuek7h4x4e9tl46yd&st=l1da7non&dl=1"))
# Take a look at the estimated model AUC
pred1_auc[[1]]
##
## ROC model ROC random PR model PR random
## 1 0.5444093 0.5103156 0.2170958 0.1958238
# As expected, PRs are generally lower
pred2_auc[[1]]
##
## ROC model ROC random PR model PR random
## 1 0.9610709 0.5091163 0.9487324 0.1925754
# After accounting for ties in the previous networks, both ROC AUC and PR AUC rise to roughly 0.95-0.96
# Create dataframe and fill two new data columns with NA
mod1_auc <- data.frame(Model = "Baseline", year = 2017:2019)
mod1_auc$auc.roc <- NA
mod1_auc$auc.pr <- NA
for (i in 1:3){
mod1_auc$auc.roc[i] <- pred1_auc[[i]]$auc.roc # Subset the AUC of ROC
mod1_auc$auc.pr[i] <- pred1_auc[[i]]$auc.pr # Subset the AUC of PR
}
mod2_auc <- data.frame(Model = "Dyadic Stability", year = 2017:2019)
mod2_auc$auc.roc <- NA
mod2_auc$auc.pr <- NA
for (i in 1:3){
mod2_auc$auc.roc[i] <- pred2_auc[[i]]$auc.roc
mod2_auc$auc.pr[i] <- pred2_auc[[i]]$auc.pr
}
# Combine data
df <- rbind(mod1_auc, mod2_auc)
df$year <- as.factor(df$year)
# ROC & PR AUC
library(ggplot2)
roc_auc <- ggplot(df, aes(x = year, y = auc.roc, group = Model)) +
geom_line(aes(linetype = Model, color = Model), lwd = 0.75, alpha = 0.75) +
scale_color_manual(values = c("#F8766D", "#00BFC4")) +
scale_linetype_manual(values = c("dashed", "solid")) +
theme_bw() +
labs(title = "ROC", x ="Year", y = "AUC") +
theme(plot.title = element_text(hjust = 0.5), legend.position = "none")
# PR AUC
pr_auc <- ggplot(df, aes(x = year, y = auc.pr, group = Model)) +
geom_line(aes(linetype = Model, color = Model), lwd = 0.75, alpha = 0.75) +
scale_color_manual(values = c("#F8766D", "#00BFC4")) +
scale_linetype_manual(values = c("dashed", "solid")) +
theme_bw() +
labs(title="Precision-Recall", x = "Year", y = "AUC") +
theme(plot.title = element_text(hjust = 0.5), legend.box.background = element_rect(colour = "black"))
# Load required packages
library(gridExtra)
library(grid)
library(cowplot)
# Render two subplots side-by-side
plot_grid(roc_auc, pr_auc, nrow = 1, rel_widths = c(0.4, 0.6))
