Statistical Learning: Modeling Social Networks

Rendering Social Networks with igraph

Visualizing graphs and analyzing networks using \(\textsf{igraph}\).

#install.packages(c("igraph", "igraphdata"))
# load packages
library(igraph)

## 
## Attaching package: 'igraph'

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

library(igraphdata)

# Let's first create an undirected network consists of three nodes and three edges using igraph
g1 <- make_graph(c(1,2, 2,3, 3,1), n = 3, directed = FALSE)   # Note how edge list c() are defined 
class(g1)

## [1] "igraph"

plot(g1)

# n (the number of nodes) can be greater than the number of edges in the edge list, meaning that many nodes are unconnected

g2 <- make_graph( edges=c(1,2, 2,3, 3,1), n = 10) # now with 10 vertices, and directed by default
plot(g2)

# Name the nodes 
g3 <- make_graph(c("A","B", "B","C", "C","A"))   # `n` will be ignored for edge list with vertex names
# When the edge list has vertex names, the number of nodes is not needed
plot(g3)

# Named nodes without edges --> 'isolates'

g4 <- make_graph( c("A", "B", "B", "C", "B", "C", "A", "A"), isolates=c("D", "E", "F", "G")) 

set.seed(123)
plot(g4, edge.arrow.size = .5, 
     vertex.color = "yellow", 
     vertex.frame.color = "gray", 
     vertex.label.color = "black", 
     vertex.size = 8, 
     vertex.label.cex = 1.5, 
     vertex.label.dist = 2,   # The distance of the label from the center of the node
     edge.curved = 0.2)

# You can also use graph_from_literal() as a handy way to quickly create small graphs

# Undirected graph
plot(graph_from_literal(A-B, B-C))  # Two edges: node B to A, node B to C

# Directed graph
plot(graph_from_literal(A+-B, B-+C))

# Note: the number of dashes doesn't matter
# Use - for undirected edges, +- or -+ for directed edges, + means the direction pointing left or right. Use ++ for a symmetric relationships, and use : for sets of nodes, that is, set ABC to set DEF

plot(graph_from_literal(A-+B:C:D))

# Get edge, node and network attributes
# Plot a network

g5 <- make_graph( c("A", "B", "B", "C", "H", "C", "A", "E", "D", "C", "B", "E"), 
             isolates=c("F", "G"), directed = TRUE)  
plot(g5)

# View the igraph object as an adjacency matrix
as_adjacency_matrix(g5)

## 8 x 8 sparse Matrix of class "dgCMatrix"
##   A B C H E D F G
## A . 1 . . 1 . . .
## B . . 1 . 1 . . .
## C . . . . . . . .
## H . . 1 . . . . .
## E . . . . . . . .
## D . . 1 . . . . .
## F . . . . . . . .
## G . . . . . . . .

# View as a sparse matrix
as_adjacency_matrix(g5, sparse = FALSE)

##   A B C H E D F G
## A 0 1 0 0 1 0 0 0
## B 0 0 1 0 1 0 0 0
## C 0 0 0 0 0 0 0 0
## H 0 0 1 0 0 0 0 0
## E 0 0 0 0 0 0 0 0
## D 0 0 1 0 0 0 0 0
## F 0 0 0 0 0 0 0 0
## G 0 0 0 0 0 0 0 0

# g5[4, ]   # Find Node H's ties with other nodes
# add edge g5['A','H'] = 1

# Get edge list
E(g5)

## + 6/6 edges from 4e6fa6a (vertex names):
## [1] A->B B->C H->C A->E D->C B->E

# Count the number of edges
ecount(g5)

## [1] 6

# Get node (vertex) list
V(g5)

## + 8/8 vertices, named, from 4e6fa6a:
## [1] A B C H E D F G

# Count the number of nodes (vertices)
vcount(g5)

## [1] 8

# Find the neighbors of A
V(g5)[.nei("A")]

## + 2/8 vertices, named, from 4e6fa6a:
## [1] B E

# Find the neighbors of A and B
g5[[c("A","B")]]

## $A
## + 2/8 vertices, named, from 4e6fa6a:
## [1] B E
## 
## $B
## + 2/8 vertices, named, from 4e6fa6a:
## [1] C E

# Alternatively, you can use
neighbors(g5,"A")

## + 2/8 vertices, named, from 4e6fa6a:
## [1] B E

# Find nodes that D is directed to
E(g5)[.from("D")]

## + 1/6 edge from 4e6fa6a (vertex names):
## [1] D->C

# Are F and H tied?
E(g5)["F" %--% "H"]

## + 0/6 edges from 4e6fa6a (vertex names):

# Find the tie that goes from F to H
E(g5)["F" %->% "H"]

## + 0/6 edges from 4e6fa6a (vertex names):

# Find nodes having ties directed to B
E(g5)[.to("B")]

## + 1/6 edge from 4e6fa6a (vertex names):
## [1] A->B

# Get node attributes
V(g5)$name

## [1] "A" "B" "C" "H" "E" "D" "F" "G"

# Add node attributes, for example. country name
V(g5)$country <- c("USA", "UKG", "AUS", "JPN", "FRN", "DEU", "NOR", "RUS")

# Add edge attributes, let's add numerical values to denote the importance of an edge, such as trade volume. We have a total of 6 edges: length(E(g5))
E(g5)$type <- "Million USD" # Applied to all edges
E(g5)$value <- c(25, 10, 7, 8, 4, 1)

# Set these new attributes
g5 <- set_graph_attr(g5, "type", "value")

# Assign value as edge weights
E(g5)$weight <- E(g5)$value
is_weighted(g5)

## [1] TRUE

# See node and edge attributes
vertex_attr(g5)

## $name
## [1] "A" "B" "C" "H" "E" "D" "F" "G"
## 
## $country
## [1] "USA" "UKG" "AUS" "JPN" "FRN" "DEU" "NOR" "RUS"

edge_attr(g5)

## $type
## [1] "Million USD" "Million USD" "Million USD" "Million USD" "Million USD"
## [6] "Million USD"
## 
## $value
## [1] 25 10  7  8  4  1
## 
## $weight
## [1] 25 10  7  8  4  1

# You can also remove (delete) certain attributes: g5 <- delete_graph_attr(g5, "value")

# Quickly check basic properties
g5

## IGRAPH 4e6fa6a DNW- 8 6 -- 
## + attr: type (g/c), name (v/c), country (v/c), type (e/c), value (e/n),
## | weight (e/n)
## + edges from 4e6fa6a (vertex names):
## [1] A->B B->C H->C A->E D->C B->E

# Serial number and letters refer to descriptive details of this graph object g5: U means this is an undirected graph; D means it is directed graph; N indicates it is a named graph.
# '--' refers to attributes not applicable to g:
# W would refer to a weighted graph --> edges have a weight attribute
# B would refer to a bipartite graph --> vertices have a type attribute
# 8 refers to the number of nodes (vertices) in g, 6 refers to the number of edges in g
# (g/c) means graph/character, (v/c) vertex/character, (e/n) edge/numeric, (e/c) edge/character


## Network Measures

# list degree centrality
degree(g5, v = V(g5),  # The degree of a node records the number of its adjacent edges.
      mode = c("all", "out", "in", "total"), 
      loops = TRUE, normalized = FALSE)

## A B C H E D F G 
## 2 3 3 1 2 1 0 0

# in-degree
degree(g5, mode="in")  # How many edges are pointing to a particular node

## A B C H E D F G 
## 0 1 3 0 2 0 0 0

# out-degree
degree(g5, mode="out")  # How many edges are from a particular node

## A B C H E D F G 
## 2 2 0 1 0 1 0 0

# Eigenvector Centrality
# Eigenvector centrality is a measure of the influence of a node in a network. Scores are assigned to other nodes in the network based on the idea that connections to high-scoring nodes contribute more to the score of the node than to low-scoring nodes. A high eigenvector score means that a node is connected to many nodes who themselves have high scores.

evcent(g5)$vector

## Warning: `evcent()` was deprecated in igraph 2.0.0.
## ℹ Please use `eigen_centrality()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

##            A            B            C            H            E            D 
## 9.685752e-01 1.000000e+00 3.836254e-01 9.469184e-02 3.084933e-01 5.410962e-02 
##            F            G 
## 8.026701e-20 8.026701e-20

# Hubs and Authorities: similar measures to Eigenvector centrality

hub.score(g5)$vector

## Warning: `hub.score()` was deprecated in igraph 2.0.0.
## ℹ Please use `hits_scores()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

##            A            B            C            H            E            D 
## 1.0000000000 0.0138481147 0.0000000000 0.0015531986 0.0000000000 0.0008875421 
##            F            G 
## 0.0000000000 0.0000000000

authority.score(g5)$vector

## Warning: `authority.score()` was deprecated in igraph 2.0.0.
## ℹ Please use `hits_scores()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

##           A           B           C           H           E           D 
## 0.000000000 1.000000000 0.006116148 0.000000000 0.320553925 0.000000000 
##           F           G 
## 0.000000000 0.000000000

# Closeness
# Calculated as the reciprocal of the sum of the length of the shortest paths between the node and all other nodes in the graph. The more central a node is, the closer it is to all other nodes.

closeness(g5)

##          A          B          C          H          E          D          F 
## 0.01470588 0.09090909        NaN 0.14285714        NaN 0.25000000        NaN 
##          G 
##        NaN

# Betweeness
# A measure of how often a node is on the shortest path between other nodes in the network. The higher the measure, the more important a node is.

betweenness(g5)

## A B C H E D F G 
## 0 1 0 0 0 0 0 0

# Generate empty graph
empty_g <- make_empty_graph(20)
plot(empty_g, vertex.size = 5, vertex.label=NA)

# Generate full graph
full_g <- make_full_graph(20)
plot(full_g, vertex.size = 5, vertex.label = NA)

# Star graph
star_g <- make_star(20)
plot(star_g, vertex.size = 5, vertex.label = NA)

# Generate scale-free Barabasi networks
bara_g <- sample_pa(200)  # You can also use barabasi.game(200)

plot(degree_distribution(bara_g), xlab = "Node", ylab = "Degree")  # Does degree distribution follow power law?

# Erdos-Renyi random graph
erdo_g <- sample_gnm(n = 20, m = 20) # You can also use erdos.renyi.game. n is the number of nodes, m the number of edges
## Options include directed = and loops =
plot(erdo_g, vertex.size = 5, vertex.label = NA)

# Generate a graph using graph_from_adjacency_matrix()
set.seed(123)
# Generate a sparse matrix by sampling from Bernoulli distribution of size 64. 
adj_m <- matrix(sample(0:1, 64, replace = TRUE, prob = c(0.9, 0.1)),
                     nc = 8) # Number of columns

adj_m

##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,]    0    0    0    0    0    0    0    0
## [2,]    0    0    0    0    0    0    0    0
## [3,]    0    1    0    0    0    0    0    0
## [4,]    0    0    1    0    0    0    0    0
## [5,]    1    0    0    0    0    0    0    0
## [6,]    0    0    0    0    0    0    0    0
## [7,]    0    0    0    1    0    0    0    0
## [8,]    0    0    1    1    0    0    0    0

g_adj <- graph_from_adjacency_matrix(adj_m,
                  diag = FALSE, # If this is FALSE then the diagonal is zeroed out first
                  mode = "max") # Default is directed, the latest version requires one to set mode = "max" in lieu of "undirected"
plot(g_adj)

## Generate graph from uploaded data (usually stored in edgelist format) using graph_from_edgelist()


# install.packages("remotes")
# remotes::install_github("RandiLGarcia/dyadr")

library(dyadr)  # We'll be using the dyadic_trade data from Randi Garcia's dyadr package https://rdrr.io/github/RandiLGarcia/dyadr/man/dyadic_trade.html
data(dyadic_trade)

# Take a look at the data
head(dyadic_trade)

##   ccode1 ccode2 year                importer1 importer2  flow1  flow2 source1
## 1      2     20 1920 United States of America    Canada 611.86 735.48       1
## 2      2     20 1921 United States of America    Canada 335.44 442.99       1
## 3      2     20 1922 United States of America    Canada 364.02 502.84       1
## 4      2     20 1923 United States of America    Canada 416.00 598.14       1
## 5      2     20 1924 United States of America    Canada 399.14 496.32       1
## 6      2     20 1925 United States of America    Canada 473.72 608.35       1
##   source2
## 1       1
## 2       1
## 3       1
## 4       1
## 5       1
## 6       1

# Subset (retrieve data from year 2009)
trade_df <- dyadic_trade[dyadic_trade$year == 2009, ]

# Data wrangling
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:igraph':
## 
##     as_data_frame, groups, union

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)

## Warning: package 'tidyr' was built under R version 4.4.3

## 
## Attaching package: 'tidyr'

## The following object is masked from 'package:igraph':
## 
##     crossing

d_trade = trade_df %>% dplyr::select(importer1, importer2, flow1) %>%
  group_by(importer1, importer2) %>%  # Organize the data into dyad format with 3 columns
  summarise(USD_m = sum(flow1))

## `summarise()` has grouped output by 'importer1'. You can override using the
## `.groups` argument.

# Convert to dataframe and remove NAs
d_trade <- as.data.frame(d_trade)
d_trade <- d_trade[d_trade$importer1 != "", ]
d_trade <- d_trade[d_trade$importer2 != "", ]
d_trade <- d_trade[d_trade$USD_m != "", ]

head(d_trade)

##     importer1  importer2  USD_m
## 2 Afghanistan  Australia  25.27
## 3 Afghanistan Bangladesh   4.56
## 4 Afghanistan     Brunei   0.03
## 5 Afghanistan   Cambodia   1.98
## 6 Afghanistan      China 239.26
## 7 Afghanistan       Fiji   0.00

# Randomly sample 100 obs
set.seed(123)
d_trade <- sample_n(d_trade, 100)

# Convert to matrix
trade_m <- as.matrix(d_trade[, 1:2])
# Rescale trade volume (million USD), it will be used to represent edge width
d_trade$USD_m <- d_trade$USD_m / 80

trade_g <- graph_from_edgelist(trade_m, directed = FALSE)
plot(trade_g, 
     vertex.size = 6, 
     edge.width = d_trade[,3],  # Set edge.width proportional to bilateral trade volume 
     vertex.color = "coral")

## Alternatively, you can also generate a network via graph_from_data_frame() that loads nodes (vertices) and edges separately. Here's my example: International Armament Collaboration data

# Load required packages
library(foreign)

# Load the data
vertices <- read.csv(url("https://www.dropbox.com/s/j1o1wh5w2ucvkje/vertices.csv?dl=1"), header = TRUE)
edges <- read.csv(url("https://www.dropbox.com/s/p78ndqv4olktifu/IAC_edges.csv?dl=1"), header = TRUE)

# Recode and generate color labels
is.na(vertices$alliance) <- vertices$alliance == ""
vertices$c_color <- c("white","orange","red","olivedrab1")[vertices$alliance] # Partners' alliance status
vertices$m_color <- ifelse(vertices$mutual_defense > 0,'black','grey80') # If a partner holds mutual defense treaty with the U.S.

color_alliance <- vertices %>%
  group_by(alliance, c_color) %>%
  summarise(n = n()) %>%
  filter(!is.na(c_color))

## `summarise()` has grouped output by 'alliance'. You can override using the
## `.groups` argument.

color_defense <- vertices %>%
  group_by(mutual_defense, m_color) %>%
  summarise(n = n()) %>%
  filter(!is.na(m_color))

## `summarise()` has grouped output by 'mutual_defense'. You can override using
## the `.groups` argument.

# Convert into graph format
g <- graph_from_data_frame(d = edges, vertices = vertices, directed = FALSE)

# Mapping
V(g)$color[V(g)$alliance == "NATO"] = "red"
V(g)$color[V(g)$alliance == "MNNA"] = "orange"
V(g)$color[V(g)$alliance == "USA"] = "olivedrab1"
w1 <- E(g)$IAC  # Number of IAC cases with the U.S.
w1 <- log(w1)  # Log transform
col = ifelse(V(g)$mutual_defense > 0,'black','grey80')
line_type <- ifelse(!is.na(E(g)$mutual_defense), 1, 2)
m1 <- layout_nicely(g)
V(g)$vertex_degree <- igraph::degree(g)

# Plotting
par(mar=c(1,1,1,1))
plot(g,
     vertex.size = 7, 
     vertex.label.color = "black", 
     vertex.frame.color = col,
     vertex.label.cex = 0.4,
     edge.color = 'grey30',
     edge.width = w1+1,
     edge.lty = line_type,
     edge.arrow.size = 0.6,
     layout = m1)

legend("topleft", legend = c("Alliance type", as.character(color_alliance$alliance), "Mutual defense", as.character(color_defense$mutual_defense)), pch = c(1, 19, 19, 19, 19, 1), col = c(NA, color_alliance$c_color, NA, color_defense$m_color), pt.cex = 1, cex = 0.7, bty = "n", ncol = 1, title = "")
legend("topleft", legend = "", cex = 0.9, bty = "n", ncol = 1,
       title = "Nature of Security Ties")

Social Network Analysis using Stochastic Actor-Oriented Models (SAOM)

# Load required packages
library(RSiena)
library(statnet)
library(igraph)
library(dplyr)


# Load FTA matrix data which record global FTA ties between 1998 and 2019
FTA <- readRDS(url("https://www.dropbox.com/scl/fi/6luyfnrur7ictisun8zxh/FTA19982019.rds?rlkey=h6af5x79e318vm6q7rj8b8loc&st=bpi38ulh&dl=1"))

# Read in country name data, you can see a list of country name, abbreviation, and 3-digit iso code. We need them to index countries in our data 
cid <- read.csv(url("https://www.dropbox.com/scl/fi/7isucibr6woxr5gnlthu8/countryid.csv?rlkey=cezf0zg1vbkcvxrrvazzu4pjf&st=uxw2yxdw&dl=1")) 

# Convert to matrix format
for(i in 1:length(FTA)){
    FTA[[i]] <- as.matrix(FTA[[i]]) 
    # Set country name and transform the adjacency FTA matrix into binary network form 
    colnames(FTA[[i]]) <- cid$comtrade_id
    rownames(FTA[[i]]) <- cid$comtrade_id
}

length(FTA)

## [1] 22

## Stacking FTA outcome network into a multidimensional array. We need FTA matrix from the last five years (2015 to 2019, 22 years!)
FTA2019 <- FTA[[22]]
FTA2018 <- FTA[[21]]
FTA2017 <- FTA[[20]]
FTA2016 <- FTA[[19]]
FTA2015 <- FTA[[18]]

# Quick question. Is global free trade on a secular rise? Let Jaccard index be our guide. Jaccard index is a statistic used for gauging the similarity between two count data. It is defined as: N11 / (N01 + N10 + N11), using FTA_change1819 for illustration, N11 is the number of FTAs existed in both periods (4375), N01 is the number of FTAs that were not in period 1 but were in period 2 (68) and N10 is the number of FTAs that were in period 1 but not in period 2 (0).

FTA_change1819 <- table(FTA[[18]], FTA[[19]])
FTA_change1819  # Similarity in global FTA networks between 2018 and 2019

##    
##         0     1
##   0 21546    68
##   1     0  4307

4307/(4307+68)

## [1] 0.9844571

FTA_change1920 <- table(FTA[[19]], FTA[[20]])
FTA_change1920  # Similarity in global FTA networks between 2019 and 2020

##    
##         0     1
##   0 21008   538
##   1     0  4375

4375/(4375+538)

## [1] 0.8904946

FTA_change2021 <- table(FTA[[20]], FTA[[21]])
FTA_change2021  # Similarity in global FTA networks between 2020 and 2021

##    
##         0     1
##   0 20994    14
##   1     2  4911

4911/(4911+14+2)

## [1] 0.9967526

FTA_change2122 <- table(FTA[[21]], FTA[[22]])
FTA_change2122  # Similarity in global FTA networks between 2021 and 2022

##    
##         0     1
##   0 20866   130
##   1    12  4913

4913/(4913+130+12)

## [1] 0.971909

# Well, it seems that the evolution of global free trade is relatively stable.


# Check row and column dimension: is it a square matrix?
nrow(FTA[[2]]) == ncol(FTA[[22]])   # Yes!

## [1] TRUE

net_size = nrow(FTA[[2]])

## Create a FTA array: five years, 161 by 161 observations in each year
FTA_array <- array(c(FTA2019 = FTA[[22]], FTA2019 = FTA[[21]], FTA2019 = FTA[[20]], FTA2019 = FTA[[19]], FTA2019 = FTA[[18]]), 
                     dim = c(net_size, net_size, 5))

dim(FTA_array)

## [1] 161 161   5

## Create SIENA dependent variable network: one-mode network (all edge values are the same, either 0 or 1)
FTA_networks <- sienaDependent(netarray = FTA_array, 
                                   type = "oneMode", 
                              allowOnly = FALSE)   # Allow the dependent variable to both increase AND decrease

## Load country attributes data. You can see a list of country-level covariates, they will be used as regressors.
country_attr <- readRDS(url("https://www.dropbox.com/scl/fi/ddhf2z5tl9bo8l9jo29vb/country_attr.rds?rlkey=5upfqxcljtuphtuvingnptptm&st=9bc9dddy&dl=1"))

# Take a look at the data, just your average-looking data.frame object
names(country_attr)

##  [1] "vertex.names"  "cname"         "year"          "population"   
##  [5] "industrial_va" "service_va"    "trade_open"    "polity"       
##  [9] "execrlc"       "liec"          "bound_rate"    "applied_rate" 
## [13] "binding"       "kaopen"        "ipr"           "binding_over" 
## [17] "lGDP"

head(country_attr)

##   vertex.names cname year population industrial_va service_va trade_open polity
## 1            4   AFG 2015   17.34187      22.12404   53.23529    55.3462     10
## 2            8   ALB 2015   14.88022      21.76366   46.28435    71.8010     20
## 3           24   AGO 2015   17.14772      41.93309   48.65989    62.8885      9
## 4           28   ATG 2015   11.44642      16.45798   69.88088    93.6295     21
## 5           32   ARG 2015   17.57671      23.15306   55.81490    22.4862     20
## 6           51   ARM 2015   14.88022      25.70870   48.21267    71.5933     16
##   execrlc liec bound_rate applied_rate binding kaopen          ipr binding_over
## 1       0    7       0.00         5.47     0.0     -2 0.0001958417        -5.47
## 2      -1    7       6.79         1.11   100.0      0 0.0040150162         5.68
## 3      -1    6      59.08         9.38   100.0     -2 0.0078045620        49.70
## 4      -1    6      58.75        12.03    97.5      1 0.0036434520        46.72
## 5      -1    7      31.74         7.35   100.0     -2 0.0265864835        24.39
## 6       0    7       8.68         2.93   100.0      2 0.0000000000         5.75
##         lGDP
## 1 0.00000000
## 2 0.00000000
## 3 0.00000000
## 4 0.00000000
## 5 0.04520129
## 6 0.04498265

## Create time-varying variables: need to reshape the original data.frame to wide format AND in matrix form
# If you want to use a constant (time-invariant or fixed) variable, then use coCovar()
## Notations ##
#cc = coCovar
#vc = varCovar
#cdc = coDyadCovar
#vdc = varDyadCovar


library(reshape)

# Population
pop <- reshape(country_attr[, c(1, 4, 3)], idvar = "vertex.names", timevar = "year", direction = "wide")
pop_vc <- varCovar(as.matrix(pop[,-1]))  # Make it into time-varying covariate, -1 means remove country id column

# Trade openness
trade_openness <- reshape(country_attr[, c(1, 7, 3)], idvar = "vertex.names", timevar = "year", direction = "wide")
to_vc <- varCovar(as.matrix(trade_openness[,-1])) # Also make it into time-varying covariate

## Create "dyadic" time-varying variables
# First, we need dyadic trade volume data between 2015 and 2019, recorded as log of million USD

trade2015 <- readRDS(url("https://www.dropbox.com/scl/fi/17i8e42xzhgo9dqdmzohn/trade2015.rds?rlkey=5ldvbtn4550vwrbobsu5lfuhx&st=ipp306c7&dl=1"))
trade2016 <- readRDS(url("https://www.dropbox.com/scl/fi/bevq047cg7q6p96qk8rk3/trade2016.rds?rlkey=x19pybshmqf45aqclc8audnqj&st=qn68qrog&dl=1"))
trade2017 <- readRDS(url("https://www.dropbox.com/scl/fi/1a2wvu3o28ur5ce462blr/trade2017.rds?rlkey=izxqr82e8iirr9h73b48i3yfx&st=gavsra6g&dl=1"))
trade2018 <- readRDS(url("https://www.dropbox.com/scl/fi/tvqyzcgqvkfo7kxpkn9j8/trade2018.rds?rlkey=sibcdornmgwzznb5myjfaeqju&st=bnssd0n7&dl=1"))
trade2019 <- readRDS(url("https://www.dropbox.com/scl/fi/gxustrrcx6kvw9137uttd/trade2019.rds?rlkey=jjvgc0ju17465bv6e9jjnebu3&st=cpgy2cry&dl=1"))

# Take a step back and select global FTA network in 2015 for visualization

g2015 <- graph_from_adjacency_matrix(FTA[[18]])  # Plot the FTA network in year 2015
g2015_edgelist <- as_edgelist(g2015)   # Convert adjacent matrix to edgelist format

# Subset country attributes in year 2015
year2015_attributes <- country_attr[country_attr$year == 2015, ]

# Fill in dyadic trade volume data using the trade matrix loaded earlier


trade2015_df <- as.data.frame(as.table(trade2015))
trade2015_df <- trade2015_df[, c(2, 1, 3)]  # Reorganize columns
names(trade2015_df) <- c("exporter", "importer", "trade_vol")

# Create an igraph object for plotting
year2015_net <- graph_from_data_frame(d = g2015_edgelist, directed = TRUE,
                                  vertices = year2015_attributes)
# Setting layout of the graph
layout <- layout.fruchterman.reingold(year2015_net)

## Warning: `layout.fruchterman.reingold()` was deprecated in igraph 2.1.0.
## ℹ Please use `layout_with_fr()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Plotting
plot(year2015_net, 
     vertex.label = country_attr[,2],   # Label with country abbreviation
     vertex.size = 10, 
     vertex.label.cex = 0.5, # Node label (country abbreviation) text size
     layout = layout, 
     edge.arrow.size = .2, 
    edge.arrow.width = trade2015_df[,3],  # Edge width proportional to dyadic trade volume
      edge.color="light gray",
      vertex.frame.color = NA)

# Back to RSiena pack and continue to create dyadic variables
# varDyadCovar() means time-varying dyadic variables

# For a data array of n by n by m periods, only m-1 periods of data will be used. The last array will be discarded (you cannot use data from the m-th period to predict the network formed in the same period)


trade_Dyadvar <- varDyadCovar(array(c(trade2015, trade2016, trade2017, trade2018), dim = c(161, 161, 4)))

FTA_covar <- sienaDataCreate(FTA_networks, pop_vc, to_vc, trade_Dyadvar)  # The first argument is your network, the rest are your node and edge variables

# After building the model. We first get the base terms using getEffects() function, they are the covariates whose effects are to be estimated.
siena_effects <- getEffects(FTA_covar)

# We can use includeEffects() to add additional variables to the base model. For example, do countries tend to extend FTA ties to other countries that are already tied to its immediate neighbors? This term is called "transTrip" in RSiena. We can add that to the model via includeEffects().

siena_effects <- includeEffects(siena_effects,  #  siena object of effects
                                transTrip,   # Name of the term to be included
                                name = "FTA_networks")  # Name of the dependent variable for which effects are being included

##   effectName          include fix   test  initialValue parm
## 1 transitive triplets TRUE    FALSE FALSE          0   0

# Set the effect of "population" as alter, altX, meaning that do alters (other countries) tend to extend FTAs to more populous focal countries (ego)?

siena_effects <- includeEffects(siena_effects, altX, 
                                interaction1 = "pop_vc")

##   effectName   include fix   test  initialValue parm
## 1 pop_vc alter TRUE    FALSE FALSE          0   0

# Also, do countries (ego) having higher level of trade openness tend to extend FTAs to other countries (alters)?

siena_effects <- includeEffects(siena_effects, egoX, 
                                interaction1 = "to_vc")

##   effectName include fix   test  initialValue parm
## 1 to_vc ego  TRUE    FALSE FALSE          0   0

# includeEffects() function is also necessary to set up the effects of covariates from an actor-oriented perspective, that is, from the perspective of the ego to alters or the other way around.

siena_effects <- includeEffects(siena_effects, X,  # X for dyadic effect, check effectsDocumentation()
                                interaction1 = "trade_Dyadvar")

##   effectName    include fix   test  initialValue parm
## 1 trade_Dyadvar TRUE    FALSE FALSE          0   0

# Model setting
# Before we estimate our model, we need to create an object of input specifications using the sienaAlgorithmCreate() function. For pedagogical purpose, I'll set most things at their default values. We'll also set seed = 123 for replication purpose.

input_options <- sienaAlgorithmCreate(projname = "FTA_model",   # Model name (you can name it whatever you like)
                                      # maxlike = TRUE,  # Whether to use maximum likelihood method or Method of Moments estimation
                                      # nsub = 5, # Number of iterations in phase 2
                                      #   n3 = 200,  # Number of iterations in phase 3
                                      seed = 123)

## If you use this algorithm object, siena07 will create/use an output file FTA_model.txt .

# We can now estimate our model using siena07() function

FTA.mod <- siena07(x = input_options, 
                data = FTA_covar,   # Variable list
                 effects = siena_effects,   # Effects to be estimated (from whose perspective)
                    batch= TRUE, 
                 verbose = FALSE, 
              returnDeps = TRUE,  # Whether to return a list of simulated dependent variables (set to TRUE if you will be running sienaGOF() later)
              useCluster = TRUE,  # Use multiple clusters
                nbrNodes = 3)  # Number of processes to use if useCluster = TRUE

## 
## Start phase 0 
## theta: -0.11  0.00  0.00  0.00  0.00  0.00 
## 
## Start phase 1 
## Phase 1 Iteration 1 Progress: 0%
## Phase 1 Iteration 4 Progress: 0%
## Phase 1 Iteration 10 Progress: 0%
## Phase 1 Iteration 25 Progress: 0%
## Phase 1 Iteration 40 Progress: 1%
## theta: -0.88389 -0.07301  0.01653  0.00913  0.02597 -0.00187 
## 
## Start phase 2.1
## Phase 2 Subphase 1 Iteration 1 Progress: 14%
## Phase 2 Subphase 1 Iteration 2 Progress: 14%
## theta -1.08375 -0.09578  0.01783  0.01205  0.02774 -0.00202 
## ac 1.191 1.195 1.224 1.682 0.934 0.977 
## Phase 2 Subphase 1 Iteration 3 Progress: 14%
## Phase 2 Subphase 1 Iteration 4 Progress: 14%
## theta -1.72026 -0.17148  0.02239  0.02074  0.03027 -0.00244 
## ac 1.253 1.262 1.295 0.948 1.002 1.102 
## Phase 2 Subphase 1 Iteration 5 Progress: 14%
## Phase 2 Subphase 1 Iteration 6 Progress: 14%
## theta -2.41235 -0.25480  0.02832  0.02848  0.02129 -0.00312 
## ac 1.263 1.273 1.302 0.994 1.074 0.951 
## Phase 2 Subphase 1 Iteration 7 Progress: 15%
## Phase 2 Subphase 1 Iteration 8 Progress: 15%
## theta -2.85688 -0.28992  0.03095  0.03132  0.00743 -0.00384 
## ac 1.264 1.273 1.301 1.224 1.097 0.915 
## Phase 2 Subphase 1 Iteration 9 Progress: 15%
## Phase 2 Subphase 1 Iteration 10 Progress: 15%
## theta -3.16447 -0.28105  0.03221  0.03270 -0.00153 -0.00538 
## ac 1.264 1.273 1.301 0.872 1.097 0.740 
## Phase 2 Subphase 1 Iteration 200 Progress: 23%
## theta -8.52867 -0.25979  0.03401  0.03357 -0.00562 -0.11360 
## ac 1.25589 1.21618 1.25405 0.00663 0.93627 0.71015 
## theta -6.07499 -0.24566  0.03204  0.03265 -0.00384 -0.06836 
## ac  1.2558  1.2142  1.2498 -0.0174  0.9169  0.7007 
## theta: -6.07499 -0.24566  0.03204  0.03265 -0.00384 -0.06836 
## 
## Start phase 2.2
## Phase 2 Subphase 2 Iteration 1 Progress: 23%
## Phase 2 Subphase 2 Iteration 2 Progress: 23%
## Phase 2 Subphase 2 Iteration 3 Progress: 23%
## Phase 2 Subphase 2 Iteration 4 Progress: 23%
## Phase 2 Subphase 2 Iteration 5 Progress: 23%
## Phase 2 Subphase 2 Iteration 6 Progress: 23%
## Phase 2 Subphase 2 Iteration 7 Progress: 23%
## Phase 2 Subphase 2 Iteration 8 Progress: 23%
## Phase 2 Subphase 2 Iteration 9 Progress: 23%
## Phase 2 Subphase 2 Iteration 10 Progress: 23%
## Phase 2 Subphase 2 Iteration 200 Progress: 31%
## theta -8.01813 -0.25968  0.03161  0.03531 -0.00399 -0.10484 
## ac  0.8557  0.5867  0.6490 -0.1304  0.0155  0.5001 
## theta -7.14941 -0.25226  0.03211  0.03499 -0.00477 -0.08945 
## ac  0.853  0.603  0.629 -0.108  0.030  0.502 
## theta: -7.14941 -0.25226  0.03211  0.03499 -0.00477 -0.08945 
## 
## Start phase 2.3
## Phase 2 Subphase 3 Iteration 1 Progress: 32%
## Phase 2 Subphase 3 Iteration 2 Progress: 32%
## Phase 2 Subphase 3 Iteration 3 Progress: 32%
## Phase 2 Subphase 3 Iteration 4 Progress: 32%
## Phase 2 Subphase 3 Iteration 5 Progress: 32%
## Phase 2 Subphase 3 Iteration 6 Progress: 32%
## Phase 2 Subphase 3 Iteration 7 Progress: 33%
## Phase 2 Subphase 3 Iteration 8 Progress: 33%
## Phase 2 Subphase 3 Iteration 9 Progress: 33%
## Phase 2 Subphase 3 Iteration 10 Progress: 33%
## Phase 2 Subphase 3 Iteration 200 Progress: 41%
## theta -8.14842 -0.24740  0.03297  0.03598 -0.00547 -0.10653 
## ac 0.9588 0.6807 0.7299 0.1104 0.0436 0.3567 
## theta -7.83950 -0.25418  0.03291  0.03512 -0.00458 -0.10128 
## ac 0.9460 0.6786 0.6680 0.1003 0.0592 0.4088 
## theta: -7.83950 -0.25418  0.03291  0.03512 -0.00458 -0.10128 
## 
## Start phase 2.4
## Phase 2 Subphase 4 Iteration 1 Progress: 43%
## Phase 2 Subphase 4 Iteration 2 Progress: 44%
## Phase 2 Subphase 4 Iteration 3 Progress: 44%
## Phase 2 Subphase 4 Iteration 4 Progress: 44%
## Phase 2 Subphase 4 Iteration 5 Progress: 44%
## Phase 2 Subphase 4 Iteration 6 Progress: 44%
## Phase 2 Subphase 4 Iteration 7 Progress: 44%
## Phase 2 Subphase 4 Iteration 8 Progress: 44%
## Phase 2 Subphase 4 Iteration 9 Progress: 44%
## Phase 2 Subphase 4 Iteration 10 Progress: 44%
## Phase 2 Subphase 4 Iteration 200 Progress: 52%
## theta -8.31718 -0.25217  0.03233  0.03451 -0.00405 -0.10981 
## ac 1.0194 0.7391 0.7676 0.0564 0.1617 0.4991 
## theta -8.31899 -0.25168  0.03263  0.03533 -0.00451 -0.10963 
## ac 0.9752 0.6915 0.7124 0.0302 0.1586 0.5069 
## theta: -8.31899 -0.25168  0.03263  0.03533 -0.00451 -0.10963 
## 
## Start phase 3 
## Phase 3 Iteration 1 Progress 59%
## Phase 3 Iteration 301 Progress 72%
## Phase 3 Iteration 601 Progress 84%
## Phase 3 Iteration 901 Progress 96%

# Check out the results. The estimated FTA.mod object include a column of estimated coefficients and simulated standard errors 
summary(FTA.mod)

## Estimates, standard errors and convergence t-ratios
## 
##                                    Estimate   Standard   Convergence 
##                                                 Error      t-ratio   
## 
## Rate parameters: 
##   0.1      Rate parameter period 1  0.8846  (  0.0751  )             
##   0.2      Rate parameter period 2  0.0995  (  0.0258  )             
##   0.3      Rate parameter period 3  3.3593  (  0.1444  )             
##   0.4      Rate parameter period 4  0.4222  (  0.0526  )             
## 
## Other parameters: 
##   1.  eval outdegree (density)     -8.3190  ( 37.5958  )    1.4478   
##   2.  eval reciprocity             -0.2517  (  0.1700  )    0.3059   
##   3.  eval transitive triplets      0.0326  (  0.0085  )    0.6765   
##   4.  eval trade_Dyadvar            0.0353  (  0.0419  )   -0.0119   
##   5.  eval pop_vc alter            -0.0045  (  0.0283  )   -0.0087   
##   6.  eval to_vc ego               -0.1096  (  0.6371  )    0.4898   
## 
## Overall maximum convergence ratio:    1.6278 
## 
## 
## Total of 2159 iteration steps.
## 
## Covariance matrix of estimates (correlations below diagonal)
## 
##     1413.446        4.833        0.005       -1.528        0.728       23.952
##        0.756        0.029        0.000       -0.005        0.002        0.082
##        0.017       -0.130        0.000        0.000        0.000        0.000
##       -0.969       -0.728       -0.048        0.002       -0.001       -0.026
##        0.684        0.515       -0.006       -0.684        0.001        0.012
##        1.000        0.756        0.019       -0.969        0.684        0.406
## 
## Derivative matrix of expected statistics X by parameters:
## 
##       43.031        8.673      475.559      -97.790        1.863    -2564.529
##        4.737      170.057      668.043       62.551        8.053       69.487
##      354.378      937.051    66191.595     4420.123     -188.983   -20171.928
##     -107.219       79.664     5960.394    20779.714      828.706     -438.250
##       -1.226        9.990       19.606     1016.502     2454.588    -2646.250
##    -2544.360     -539.067   -27643.006     7055.313     -136.653   153571.571
## 
## Covariance matrix of X (correlations below diagonal):
## 
##       75.656       15.212     1091.634     -118.628       12.191    -1895.675
##        0.093      351.888     3178.457      396.599        6.382     2123.378
##        0.234        0.316   288291.922    22827.959    -1991.753   120409.456
##       -0.065        0.101        0.202    44097.661     1403.566    -3050.761
##        0.028        0.007       -0.073        0.132     2565.054    -6922.330
##       -0.146        0.076        0.150       -0.010       -0.092  2227533.506

# It looks like transitive triplets significantly increase the probability of i-j FTA relative to with other countries, while the three dyadic and nodal attributes are not significant. In fact, their estimated coefficients are in the opposite directions.
# Estimated coefficients are reported in odds-ratio format, so they should be interpreted as exponentiated odds. For example, we see a coefficient for trade_Dyadvar of 0.0353. 
# This would suggest that the odds of an FTA from i being sent to j is exp(0.0353) - 1 times higher than being sent to k, assuming that i-j FTA exists but i-k FTA does not.
# You can also see how rate parameters vary across the four periods. The rate parameter is the lowest in period 2, meaning that FTA network changes less during this period, but the rate of change kind of accelerated in period 3.


# We're now ready to plot the goodness-of-fit statistics (gof), which compares deviations between the observed and simulated networks. It will calculate gof statistics for each period
# Low p-values would indicate a potential problem. Our result looks good.

gof1 <- sienaGOF(sienaFitObject = FTA.mod, 
              auxiliaryFunction = IndegreeDistribution,  # Returns a named vector, the distribution of the observed or simulated in-degrees for the values in levels.
                 verbose = TRUE,   # Whether to print immediate results
                    join = TRUE,   # Boolean: should sienaGOF test all periods individually (FALSE), or sum across periods (TRUE)
                 varName = "FTA_networks")

## Detected 1002 iterations and 1 group.

## Calculating auxiliary statistics for periods  1 2 3 4 .

##   Period 1

##   > Completed  100  calculations  > Completed  200  calculations  > Completed  300  calculations  > Completed  400  calculations  > Completed  500  calculations  > Completed  600  calculations  > Completed  700  calculations  > Completed  800  calculations  > Completed  900  calculations  > Completed  1000  calculations  > Completed  1002  calculations

##   Period 2

##   > Completed  100  calculations  > Completed  200  calculations  > Completed  300  calculations  > Completed  400  calculations  > Completed  500  calculations  > Completed  600  calculations  > Completed  700  calculations  > Completed  800  calculations  > Completed  900  calculations  > Completed  1000  calculations  > Completed  1002  calculations

##   Period 3

##   > Completed  100  calculations  > Completed  200  calculations  > Completed  300  calculations  > Completed  400  calculations  > Completed  500  calculations  > Completed  600  calculations  > Completed  700  calculations  > Completed  800  calculations  > Completed  900  calculations  > Completed  1000  calculations  > Completed  1002  calculations

##   Period 4

##   > Completed  100  calculations  > Completed  200  calculations  > Completed  300  calculations  > Completed  400  calculations  > Completed  500  calculations  > Completed  600  calculations  > Completed  700  calculations  > Completed  800  calculations  > Completed  900  calculations  > Completed  1000  calculations  > Completed  1002  calculations

plot(gof1)

## Calculate prediction accuracy
# Although RSiena primarily focuses on estimating parameters for network evolution models rather than direct prediction accuracy measures like classification accuracy or AUC. However, we can still assess prediction accuracy using methods based on the goodness-of-fit (GOF), which we just did, or by comparing observed versus simulated networks.

# Extract first simulated network in period 2, which is stored in $sims 
# The structure of sims is a nested list: sims[[run]][[group]][[dependent variable]][[period]]. If x$maxlike=TRUE and there is only one group (like our data here) and one period, then the structure is [[run]][[dependent variable]].

sim_net <- FTA.mod$sims[[1]][1]$Data1$FTA_networks$`2`  # This means first simulated network from data 1 (we only have one here) in period 2. The output is a 3-column edgelist (i, j, tie) = [node i, node j, i-j tie]

# Examine the first 10 observations
head(sim_net, 10)

##       [,1] [,2] [,3]
##  [1,]    1   10    1
##  [2,]    1   66    1
##  [3,]    1   89    1
##  [4,]    1  102    1
##  [5,]    1  110    1
##  [6,]    2    8    1
##  [7,]    2   12    1
##  [8,]    2   36    1
##  [9,]    2   37    1
## [10,]    2   39    1

# We now fill in edgelist values into a 161 by 161 matrix
simulated_network <- matrix(0, nrow = 161, ncol = 161)

for(i in 1:nrow(sim_net)){
    simulated_network[sim_net[i,1], sim_net[i,2]] = sim_net[i,3]
}

# Extract actual FTA network observed in period 2
observed_network <- FTA_covar$depvars[[1]][, , 2]  

# Compute accuracy
correct_predictions <- sum(simulated_network == observed_network) / length(observed_network)
cat("New FTAs prediction accuracy:", correct_predictions)

## New FTAs prediction accuracy: 0.9993442

Re-analyze FTA formation using static and temporal Exponential Random Graphs Models (ERGM)

# Load required packages
library(ergm)
library(btergm)  # For binary temporal ERGM
library(speedglm) # For the use of glm inside btergm()
library(statnet)
library(network)

# Load FTA data again
FTA <- readRDS(url("https://www.dropbox.com/scl/fi/6luyfnrur7ictisun8zxh/FTA19982019.rds?rlkey=h6af5x79e318vm6q7rj8b8loc&st=bpi38ulh&dl=1"))

# Load cid again for country indexing
cid <- read.csv(url("https://www.dropbox.com/scl/fi/7isucibr6woxr5gnlthu8/countryid.csv?rlkey=cezf0zg1vbkcvxrrvazzu4pjf&st=uxw2yxdw&dl=1")) 


# Pretty much the same, but this time, we need to tweak the data a bit by converting the data to network class via network()
for(i in 1:length(FTA)){
  colnames(FTA[[i]]) <- cid$comtrade_id
  rownames(FTA[[i]]) <- cid$comtrade_id
  FTA[[i]] <- network(as.matrix(FTA[[i]]), 
  matrix.type = 'adjacency',  # Convert to adjacency matrix
  ignore.eval = TRUE,
  directed = TRUE)  # Directed network
}

FTA <- FTA[18:22]  # Subset the last five years (2015-2019)

# Load dyadic trade data again
trade2015 <- readRDS(url("https://www.dropbox.com/scl/fi/17i8e42xzhgo9dqdmzohn/trade2015.rds?rlkey=5ldvbtn4550vwrbobsu5lfuhx&st=ipp306c7&dl=1"))
trade2016 <- readRDS(url("https://www.dropbox.com/scl/fi/bevq047cg7q6p96qk8rk3/trade2016.rds?rlkey=x19pybshmqf45aqclc8audnqj&st=qn68qrog&dl=1"))
trade2017 <- readRDS(url("https://www.dropbox.com/scl/fi/1a2wvu3o28ur5ce462blr/trade2017.rds?rlkey=izxqr82e8iirr9h73b48i3yfx&st=gavsra6g&dl=1"))
trade2018 <- readRDS(url("https://www.dropbox.com/scl/fi/tvqyzcgqvkfo7kxpkn9j8/trade2018.rds?rlkey=sibcdornmgwzznb5myjfaeqju&st=bnssd0n7&dl=1"))
trade2019 <- readRDS(url("https://www.dropbox.com/scl/fi/gxustrrcx6kvw9137uttd/trade2019.rds?rlkey=jjvgc0ju17465bv6e9jjnebu3&st=cpgy2cry&dl=1"))

## Combine annual dyadic trade into a list object
dyadic_trade <- list(trade2015, trade2016, trade2017, trade2018, trade2019)

## Load country attributes data (again)
country_attr <- readRDS(url("https://www.dropbox.com/scl/fi/ddhf2z5tl9bo8l9jo29vb/country_attr.rds?rlkey=5upfqxcljtuphtuvingnptptm&st=9bc9dddy&dl=1"))


# Setting nodal attributes within network objects using set.vertex.attribute() function, so that these variables will be attached to the array of network objects, FTA, as nodal attributes 

for(i in unique(country_attr$year)){
 population <- country_attr[country_attr$year == i, c("population")]
 trade_open <- country_attr[country_attr$year == i, c("trade_open")]   
 network::set.vertex.attribute(FTA[[(i-2014)]], value = population, attrname = "population")  # Use i - 2014 in order to sort the index in FTA list
 network::set.vertex.attribute(FTA[[(i-2014)]], value = trade_open, attrname = "trade_open") 
}


set.seed(123)

# Baseline model. select a single year (2015) and estimate tie probability of that year
FTA.ergm <- ergm(FTA[[1]] ~ edgecov(dyadic_trade[[1]]) + nodecov("population") + nodecov("trade_open"))   # Here we use edgecov() and nodecov() to create edge and node covariates inside ergm() function

## Starting maximum pseudolikelihood estimation (MPLE):

## Obtaining the responsible dyads.

## Evaluating the predictor and response matrix.

## Maximizing the pseudolikelihood.

## Finished MPLE.

## Evaluating log-likelihood at the estimate.

summary(FTA.ergm)

## Call:
## ergm(formula = FTA[[1]] ~ edgecov(dyadic_trade[[1]]) + nodecov("population") + 
##     nodecov("trade_open"))
## 
## Maximum Likelihood Results:
## 
##                            Estimate Std. Error MCMC % z value Pr(>|z|)    
## edgecov.dyadic_trade[[1]]  0.023837   0.002330      0  10.230   <1e-04 ***
## nodecov.population        -0.067472   0.001540      0 -43.819   <1e-04 ***
## nodecov.trade_open         0.001270   0.000184      0   6.898   <1e-04 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##      Null Deviance: 35711  on 25760  degrees of freedom
##  Residual Deviance: 23158  on 25757  degrees of freedom
##  
## AIC: 23164  BIC: 23189  (Smaller is better. MC Std. Err. = 0)

# Wow, previously insignificant variables now become significant! ERGM does appear better at predicting ties.

# Adding network structural characteristics
# density: equivalent to "edges" or "istar(1)" or "ostar(1)" divided by n*(n-1).
# mutual: reciprocity term
# twopath: this term adds one statistic to the model, equal to the number of 2-paths in the network i->j->k
# Let's add the "twopath" term as an exploratory probe. Please do take a look at the verbose log of the Monte Carlo maximum likelihood (MCMLE) process

FTA_structural.ergm <- ergm(FTA[[1]] ~ edgecov(dyadic_trade[[1]]) + nodecov("population") + nodecov("trade_open") + twopath, 
                    control = control.ergm(parallel = 4, # Use 4 CPU cores
              parallel.type = "PSOCK", 
            MCMC.samplesize = 5000))    # Number of network statistics, randomly drawn from a given distribution on the set of all networks, returned by the Metropolis-Hastings algorithm.

## Starting maximum pseudolikelihood estimation (MPLE):
## Obtaining the responsible dyads.
## Evaluating the predictor and response matrix.
## Maximizing the pseudolikelihood.
## Finished MPLE.
## Starting Monte Carlo maximum likelihood estimation (MCMLE):
## Iteration 1 of at most 60:

## Warning: 'glpk' selected as the solver, but package 'Rglpk' is not available;
## falling back to 'lpSolveAPI'. This should be fine unless the sample size and/or
## the number of parameters is very big.

## 1 Optimizing with step length 1.0000.
## The log-likelihood improved by 3.3274.
## Estimating equations are not within tolerance region.
## Iteration 2 of at most 60:
## 1 Optimizing with step length 1.0000.
## The log-likelihood improved by 0.4189.
## Estimating equations are not within tolerance region.
## Iteration 3 of at most 60:
## 1 Optimizing with step length 1.0000.
## The log-likelihood improved by 0.0406.
## Convergence test p-value: 0.0721. Not converged with 99% confidence; increasing sample size.
## Iteration 4 of at most 60:
## 1 Optimizing with step length 1.0000.
## The log-likelihood improved by 0.0133.
## Convergence test p-value: 0.5063. Not converged with 99% confidence; increasing sample size.
## Iteration 5 of at most 60:
## 1 Optimizing with step length 1.0000.
## The log-likelihood improved by 0.0228.
## Convergence test p-value: 0.1542. Not converged with 99% confidence; increasing sample size.
## Iteration 6 of at most 60:
## 1 Optimizing with step length 1.0000.
## The log-likelihood improved by 0.0219.
## Convergence test p-value: 0.0091. Converged with 99% confidence.
## Finished MCMLE.
## Evaluating log-likelihood at the estimate. Fitting the dyad-independent submodel...
## Bridging between the dyad-independent submodel and the full model...
## Setting up bridge sampling...
## Using 16 bridges: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 .
## Bridging finished.
## 
## This model was fit using MCMC.  To examine model diagnostics and check
## for degeneracy, use the mcmc.diagnostics() function.

summary(FTA_structural.ergm)

## Call:
## ergm(formula = FTA[[1]] ~ edgecov(dyadic_trade[[1]]) + nodecov("population") + 
##     nodecov("trade_open") + twopath, control = control.ergm(parallel = 4, 
##     parallel.type = "PSOCK", MCMC.samplesize = 5000))
## 
## Monte Carlo Maximum Likelihood Results:
## 
##                             Estimate Std. Error MCMC % z value Pr(>|z|)    
## edgecov.dyadic_trade[[1]]  1.813e-02  2.066e-03      0   8.777   <1e-04 ***
## nodecov.population        -8.873e-02  1.760e-03      0 -50.427   <1e-04 ***
## nodecov.trade_open         1.896e-05  1.564e-04      0   0.121    0.903    
## twopath                    1.743e-02  9.250e-04      0  18.837   <1e-04 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##      Null Deviance: 35711  on 25760  degrees of freedom
##  Residual Deviance: 22951  on 25756  degrees of freedom
##  
## AIC: 22959  BIC: 22991  (Smaller is better. MC Std. Err. = 3.146)

# After adding twopaths, trade_open lost its significance.

Longitudinal Analysis of FTA formation: BTERGM

# Fit a plain vanilla base btergm
FTA.btergm <- btergm(FTA ~ edgecov(dyadic_trade) + nodecov("population") + nodecov("trade_open"))

# Adding memory term, note that multiple memory term options cannot be used together (you can only choose one)

# NOT RUN
# Dyadic stability (type = "stability"): checks whether both edges and non-edges are stable between previous and current network
FTA_stability.btergm <- btergm(FTA ~ edgecov(dyadic_trade) + nodecov("population") + nodecov("trade_open") + memory(type = "stability"))

# You may also consider positive autoregression (type = "autoregression"): checks whether previous ties are carried over to the current network
# Lagged by 1 period
FTA_auto.btergm <- btergm(FTA ~ edgecov(dyadic_trade) + nodecov("population") + nodecov("trade_open") + memory(type = "autoregression", lag = 1))

Results.

# btergm FTA estimation results
FTA.btergm <- readRDS(url("https://www.dropbox.com/scl/fi/x4c2c56o21ryxarlhfjbw/FTA.btergm.rds?rlkey=z7hu0v1z7u2qf21vne1t2r1ix&st=475rvyzi&dl=1"))
FTA_stability.btergm <- readRDS(url("https://www.dropbox.com/scl/fi/0yf039ydu9ohuxzq5ba7r/FTA_stability.btergm.rds?rlkey=gvp889g0r4v4igdcxsxkoe3sw&st=r7estp39&dl=1"))

# Results at a glance
summary(FTA.btergm)

## ==========================

## Summary of model fit

## ==========================

## 
## Formula:   FTA ~ edgecov(dyadic_trade) + nodecov("population") + nodecov("trade_open")

## Time steps: 5

## Bootstrapping sample size: 500

## Estimates and 95% confidence intervals:

##                              Estimate   Boot mean    2.5%   97.5%
## edgecov.dyadic_trade[[i]]  5.6619e-02  0.05712394  0.0518  0.0635
## nodecov.population        -6.9177e-02 -0.06946778 -0.0735 -0.0654
## nodecov.trade_open        -6.2946e-05 -0.00007736 -0.0002  0.0000

summary(FTA_stability.btergm)

## ==========================

## Summary of model fit

## ==========================

## 
## Formula:   FTA ~ edgecov(dyadic_trade) + nodecov("population") + nodecov("trade_open") + memory(type = "stability")

## Time steps: 4

## Bootstrapping sample size: 500

## Estimates and 95% confidence intervals:

##                            Estimate Boot mean    2.5%   97.5%
## edgecov.dyadic_trade[[i]] 0.0309516 0.0243176 -0.0014  0.0372
## nodecov.population        0.0144705 0.0183102  0.0117  0.0291
## nodecov.trade_open        0.0018269 0.0015195 -0.0007  0.0029
## edgecov.memory[[i]]       5.8766994 6.0160961  5.3119  7.3824

# Only population and structural variables remain significant.

Assessing prediction accuracy of BTERGM

# NOT RUN
# Use simulated networks (via gof() function) from the estimation of [t, t+1] data and compare them to the actual network at t+2

# Model 1 (baseline model)

pred1_auc <- list()

for (i in 1:3){       # A total of 3 predictions will be made
  train_t <- c(i, i+1)  # Set time frame (use two years' data as training data)
  dy_trade <- dyadic_trade[train_t]  # Subset training data time frame
  
model1_fit <- btergm(FTA[train_t] ~
                  edgecov(dy_trade) + nodecov("population") + nodecov("trade_open"),  # edge and nodal covariates
                  R = 1000,   # 1000 bootstrap replications
                  parallel = "multicore",   # Parallel and multi-core processing
                  ncpus = 4)  # Number of CPU cores used

model1_test <- i+2  # Number of predictions: 3 predictions

g <- gof(model1_fit, statistics = c(rocpr), nsim = 100, target = FTA[model1_test])  # Retrieve ROC-Precision metric

pred1_auc[[i]] <- g$`Tie prediction`  # Store tie prediction result in pred1_auc list

}


# Model 2 (baseline model w/ stability term)

pred2_auc <- list()

for (i in 1:3){    
  train_t <- c(i+1, i+2)  
  dy_trade <- dyadic_trade[train_t] 
  
model2_fit <- btergm(FTA[train_t] ~
                  edgecov(dy_trade) + nodecov("population") + nodecov("trade_open") +  + memory(type = "stability"),  
                  R = 1000,   # 1000 bootstrap replications
                  parallel = "multicore",   # Parallel and multi-core processing
                  ncpus = 4)  # Number of CPU cores used

model2_test <- i+2

g <- gof(model2_fit, statistics = c(rocpr), nsim = 100, target = FTA[model2_test]) 

pred2_auc[[i]] <- g$`Tie prediction` 

}

Create dataframe for plotting

# Load previously generated prediction result list
pred1_auc <- readRDS(url("https://www.dropbox.com/scl/fi/hzws9swlsytyybextjn9r/pred1_auc.rds?rlkey=c2hsmntk3w18o9n4ci3lcvcpk&st=1gmvilhu&dl=1"))
pred2_auc <- readRDS(url("https://www.dropbox.com/scl/fi/ru05hs3xymsqzlfdq8byv/pred2_auc.rds?rlkey=fbr3pmjgzuek7h4x4e9tl46yd&st=l1da7non&dl=1"))


# Take a look at the estimated model AUC
pred1_auc[[1]]

##

##   ROC model ROC random  PR model PR random
## 1 0.5444093  0.5103156 0.2170958 0.1958238

# As expected, PRs are generally lower

pred2_auc[[1]]

##

##   ROC model ROC random  PR model PR random
## 1 0.9610709  0.5091163 0.9487324 0.1925754

# After accounting for the ties of previous networks, both ROC AUC and PR AUC rose to near .99 

# Create dataframe and fill two new data columns with NA
mod1_auc <- data.frame(Model = "Baseline", year = 2017:2019)
mod1_auc$auc.roc <- NA
mod1_auc$auc.pr <- NA

for (i in 1:3){
  mod1_auc$auc.roc[i] <- pred1_auc[[i]]$auc.roc  # Subset the AUC of ROC
  mod1_auc$auc.pr[i] <- pred1_auc[[i]]$auc.pr   # Subset the AUC of PR
}


mod2_auc <- data.frame(Model = "Dyadic Stability", year = 2017:2019)
mod2_auc$auc.roc <- NA
mod2_auc$auc.pr <- NA

for (i in 1:3){
  mod2_auc$auc.roc[i] <- pred2_auc[[i]]$auc.roc
  mod2_auc$auc.pr[i] <- pred2_auc[[i]]$auc.pr
}


# Combine data
df <- rbind(mod1_auc, mod2_auc)
df$year <- as.factor(df$year)

# ROC & PR AUC
library(ggplot2)

roc_auc <- ggplot(df, aes(x = year, y = auc.roc, group = Model)) +
  geom_line(aes(linetype = Model, color = Model), lwd = 0.75, alpha = 0.75) +
  scale_color_manual(values = c("#F8766D", "#00BFC4")) +
  scale_linetype_manual(values = c("dashed", "solid")) +
  theme_bw() +
  labs(title = "ROC", x ="Year", y = "AUC") +
  theme(plot.title = element_text(hjust = 0.5), legend.position = "none")

# PR AUC

pr_auc <- ggplot(df, aes(x = year, y = auc.pr, group = Model)) +
  geom_line(aes(linetype = Model, color = Model), lwd = 0.75, alpha = 0.75) +
  scale_color_manual(values = c("#F8766D", "#00BFC4")) +
  scale_linetype_manual(values = c("dashed", "solid")) +
  theme_bw() +
  labs(title="Precision-Recall", x = "Year", y = "AUC") +
  theme(plot.title = element_text(hjust = 0.5), legend.box.background = element_rect(colour = "black"))


# Load required packages
library(gridExtra)
library(grid)
library(cowplot)

# Render two subplots side-by-side
plot_grid(roc_auc, pr_auc, nrow = 1, rel_widths = c(0.4, 0.6))

Statistical Learning: Modeling Social Networks

H. K. Tseng

5/20/2025

Re-analyze FTA formation using static and temporal Exponential Random Graphs Models (ERGM)

Longitudinal Analysis of FTA formation: BTERGM

Assessing prediction accuracy of BTERGM

Create dataframe for plotting