Social Network Analysis

Steward Bank

library('tidyverse')
Warning: package 'tidyverse' was built under R version 4.2.1
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
✔ ggplot2 3.3.6     ✔ purrr   0.3.4
✔ tibble  3.1.7     ✔ dplyr   1.0.9
✔ tidyr   1.2.0     ✔ stringr 1.4.0
✔ readr   2.1.2     ✔ forcats 0.5.1
Warning: package 'ggplot2' was built under R version 4.2.1
Warning: package 'tibble' was built under R version 4.2.1
Warning: package 'tidyr' was built under R version 4.2.1
Warning: package 'readr' was built under R version 4.2.1
Warning: package 'purrr' was built under R version 4.2.1
Warning: package 'dplyr' was built under R version 4.2.1
Warning: package 'stringr' was built under R version 4.2.1
Warning: package 'forcats' was built under R version 4.2.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
library(janitor)
Warning: package 'janitor' was built under R version 4.2.1

Attaching package: 'janitor'
The following objects are masked from 'package:stats':

    chisq.test, fisher.test
sourcedestination <- read_csv("data/sourcedestination.csv") %>% clean_names() 
Rows: 7 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): From Acoount, To Account
dbl (1): Frequency

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
source_accountDestination <- read_csv("data/source_accountDestination.csv") %>% clean_names() 
Rows: 7 Columns: 2
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): Source, Destination

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
 customerAge <- read_csv("data/customerAge.csv") %>% clean_names()
Rows: 7 Columns: 2
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Name
dbl (1): age

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Undirected edgelist

library(igraph)
Warning: package 'igraph' was built under R version 4.2.1

Attaching package: 'igraph'
The following objects are masked from 'package:dplyr':

    as_data_frame, groups, union
The following objects are masked from 'package:purrr':

    compose, simplify
The following object is masked from 'package:tidyr':

    crossing
The following object is masked from 'package:tibble':

    as_data_frame
The following objects are masked from 'package:stats':

    decompose, spectrum
The following object is masked from 'package:base':

    union
g <- graph.edgelist(as.matrix(source_accountDestination),
directed = FALSE)
g
IGRAPH bc1823c UN-- 7 7 -- 
+ attr: name (v/c)
+ edges from bc1823c (vertex names):
[1] Valentine--Mcnamara  Valentine--Loid      Valentine--Nyaradzo 
[4] Valentine--Stephanie Valentine--George    Stephanie--George   
[7] George   --Tinashe  

Directed Edgelist

D <- graph.edgelist(as.matrix(source_accountDestination),
directed = TRUE)
D
IGRAPH bc2e018 DN-- 7 7 -- 
+ attr: name (v/c)
+ edges from bc2e018 (vertex names):
[1] Valentine->Mcnamara  Valentine->Loid      Valentine->Nyaradzo 
[4] Valentine->Stephanie Valentine->George    Stephanie->George   
[7] George   ->Tinashe  

Inspecting our source_destination key

V(g)
+ 7/7 vertices, named, from bc1823c:
[1] Valentine Mcnamara  Loid      Nyaradzo  Stephanie George    Tinashe  
E(g)
+ 7/7 edges from bc1823c (vertex names):
[1] Valentine--Mcnamara  Valentine--Loid      Valentine--Nyaradzo 
[4] Valentine--Stephanie Valentine--George    Stephanie--George   
[7] George   --Tinashe  
gorder(g)
[1] 7
gsize(g)
[1] 7

Vizualising the Social Network undirected

plot(g)

Vizualising the Social Network directed

plot(D)

Network Attributes

Vertex attributes

g
IGRAPH bc1823c UN-- 7 7 -- 
+ attr: name (v/c)
+ edges from bc1823c (vertex names):
[1] Valentine--Mcnamara  Valentine--Loid      Valentine--Nyaradzo 
[4] Valentine--Stephanie Valentine--George    Stephanie--George   
[7] George   --Tinashe  

Edge attributes

Adding Vertex A

g <- set_vertex_attr(
g,
"age",
value = c(
20,25,21,23,24,23,22
)
)
vertex_attr(g)
$name
[1] "Valentine" "Mcnamara"  "Loid"      "Nyaradzo"  "Stephanie" "George"   
[7] "Tinashe"  

$age
[1] 20 25 21 23 24 23 22
g <- set_vertex_attr(
g,
"age",
value = customerAge$age
)
vertex_attr(g)
$name
[1] "Valentine" "Mcnamara"  "Loid"      "Nyaradzo"  "Stephanie" "George"   
[7] "Tinashe"  

$age
[1] 31 27 28 27 27 41 29

Adding Edge Attributes

g <- set_edge_attr(
g,
"frequency",
value = sourcedestination$frequency
)
edge_attr(g)
$frequency
[1] 2 1 1 1 3 2 4

Adding attributes II

graph_from_data_frame(vertices = customerAge, d = sourcedestination,
directed = FALSE)
IGRAPH bca970b UN-- 7 7 -- 
+ attr: name (v/c), age (v/n), frequency (e/n)
+ edges from bca970b (vertex names):
[1] Mcnamara --Valentine Loid     --Valentine Nyaradzo --Valentine
[4] Valentine--Stephanie George   --Valentine George   --Stephanie
[7] George   --Tinashe  

Subsetting networks

E(g)[[inc('Stephanie')]]
Warning: 'inc' is deprecated.
Use '.inc' instead.
See help("Deprecated")
+ 2/7 edges from bc1823c (vertex names):
       tail      head tid hid frequency
4 Valentine Stephanie   1   5         1
6 Stephanie    George   5   6         2
E(g)[[frequency>=3]]
+ 2/7 edges from bc1823c (vertex names):
       tail    head tid hid frequency
5 Valentine  George   1   6         3
7    George Tinashe   6   7         4

Network visualization

V(g)$color <- ifelse(
V(g)$age > 30, "red", "white"
)
plot(
g,
vertex.label.color = "black"
)

Styling vertices and edges

Choosing the appropriate layout

  1. Minimize edge crossing

  2. Do not allow vertices to overlap

  3. Make edge lengths as uniform as possible

  4. Increase symmetry of the network as much as possible

  5. Position more influential nodes towards the centre

igraph layouts

  • circle

  • fruchterman-reingold

  • kamada-kawai

  • grid

  • lgl

  • tree

plot(D, layout = layout.fruchterman.reingold(D))

plot(g, layout = layout.grid(g))

Directed networks

Examining the igraph Checking igraph objects

is.directed(g)
[1] FALSE
is.directed(D)
[1] TRUE
is.weighted(g)
[1] FALSE
is.weighted(D)
[1] FALSE

In-degree and out-degree

Is there an edge between Valentine & Mcnamara

g['Valentine','Mcnamara']
[1] 1

Find the starting vertex of all edges:

head_of(g, E(g))
+ 7/7 vertices, named, from bc1823c:
[1] Mcnamara  Loid      Nyaradzo  Stephanie George    George    Tinashe  

Show all edges to or from A: ``

incident(g,'Valentine', mode=c("all"))
+ 5/7 edges from bc1823c (vertex names):
[1] Valentine--Mcnamara  Valentine--Loid      Valentine--Nyaradzo 
[4] Valentine--Stephanie Valentine--George   
# incident(g,'A', mode=c("all"))

Relationships between vertices

Identifying neighbors

neighbors(g, "George", mode = c("all"))
+ 3/7 vertices, named, from bc1823c:
[1] Valentine Stephanie Tinashe  

Identifying neighbors in common

x <- neighbors(
g, "George", mode = c("all")
)
y <- neighbors(
g, "Stephanie", mode = c("all")
)
intersection(x,y)
+ 1/7 vertex, named, from bc1823c:
[1] Valentine

Path

farthest_vertices(g)
$vertices
+ 2/7 vertices, named, from bc1823c:
[1] Mcnamara Tinashe 

$distance
[1] 3

Identifying vertices reachable in N steps

  1. ego(g, 2, 'Nyaradzo', mode=c('out'))
    [[1]]
    + 6/7 vertices, named, from bc1823c:
    [1] Nyaradzo  Valentine Mcnamara  Loid      Stephanie George   
ego(g, 2, 'Loid', mode=c('out'))
[[1]]
+ 6/7 vertices, named, from bc1823c:
[1] Loid      Valentine Mcnamara  Nyaradzo  Stephanie George   
ego(g, 2, 'Stephanie', mode=c('out'))
[[1]]
+ 7/7 vertices, named, from bc1823c:
[1] Stephanie Valentine George    Mcnamara  Loid      Nyaradzo  Tinashe  

Important and influential vertices

Measures of vertex importance

  • degree

  • betweenness

  • eigenvector centrality

  • closeness centrality

  • pagerank centrality

Out-degree and in-degree

degree(g, mode = c("out"))
Valentine  Mcnamara      Loid  Nyaradzo Stephanie    George   Tinashe 
        5         1         1         1         2         3         1 
table(head_of(g, E(g)))

2 3 4 5 6 7 
1 1 1 1 2 1 
betweenness(D, directed = TRUE)
Valentine  Mcnamara      Loid  Nyaradzo Stephanie    George   Tinashe 
        0         0         0         0         0         2         0 
betweenness(g, directed = TRUE,
normalized = TRUE)
Valentine  Mcnamara      Loid  Nyaradzo Stephanie    George   Tinashe 
0.8000000 0.0000000 0.0000000 0.0000000 0.0000000 0.3333333 0.0000000 

Network Structure

Eigenvector centrality

eigen_centrality(g)$vector
Valentine  Mcnamara      Loid  Nyaradzo Stephanie    George   Tinashe 
1.0000000 0.3854455 0.3854455 0.3854455 0.6782950 0.7597688 0.2928495 

Density

edge_density(g)
[1] 0.3333333

Average path length

mean_distance(g, directed = FALSE)
[1] 1.809524

Network Randomisation Random graphs

erdos.renyi.game(n = gorder(g), p.or.m = edge_density(g), type = "gnp")
IGRAPH be41e4f U--- 7 5 -- Erdos-Renyi (gnp) graph
+ attr: name (g/c), type (g/c), loops (g/l), p (g/n)
+ edges from be41e4f:
[1] 2--3 3--4 2--6 1--7 5--7

Random graphs & randomization tests

  1. Generate 1000 random graphs based on the original network
  • e.g. with the same number of vertices and approximate density.
  1. Calculate the average path length of the original network.
  2. Calculate the average path length of the 1000 random networks.
  3. Determine how many random networks have an average path length greater or less than the original network’s average path length. Generate 1000 random graphs
# Identify key nodes using eigenvector centrality
g.ec <- eigen_centrality(g)
which.max(g.ec$vector)
Valentine 
        1 
# Plot Forrest Gump Network
plot(g,
vertex.label.color = "black", 
vertex.label.cex = 0.6,
vertex.size = 25*(g.ec$vector),
edge.color = 'gray88',
main = "Forrest Gump Network"
)

Comparing to the original network

gl <- vector('list',1000)
for(i in 1:1000){
gl[[i]] <- erdos.renyi.game(
n = gorder(g),
p.or.m = edge_density(g),
type = "gnp"
)
}
gl.apls <- unlist(
lapply(gl, mean_distance, directed = FALSE)
)
hist(gl.apls, breaks = 20)
abline(
v = mean_distance(
g, directed=FALSE
),
col = "red",
lty = 3,
lwd = 2
)

ego(g, 2, 'George', mode=c('out'))
[[1]]
+ 7/7 vertices, named, from bc1823c:
[1] George    Valentine Stephanie Tinashe   Mcnamara  Loid      Nyaradzo 
degree(g, mode = c("out"))
Valentine  Mcnamara      Loid  Nyaradzo Stephanie    George   Tinashe 
        5         1         1         1         2         3         1 
I -> F -> E -> H
I -> F -> A -> E -> H
betweenness(g, directed = TRUE)
Valentine  Mcnamara      Loid  Nyaradzo Stephanie    George   Tinashe 
       12         0         0         0         0         5         0 
betweenness(g, directed = TRUE, normalized = TRUE)
Valentine  Mcnamara      Loid  Nyaradzo Stephanie    George   Tinashe 
0.8000000 0.0000000 0.0000000 0.0000000 0.0000000 0.3333333 0.0000000 
transitivity(g)
[1] 0.2142857
eigen_centrality(g)$vector
Valentine  Mcnamara      Loid  Nyaradzo Stephanie    George   Tinashe 
1.0000000 0.3854455 0.3854455 0.3854455 0.6782950 0.7597688 0.2928495 
edge_density(g)
[1] 0.3333333
mean_distance(g, directed = FALSE)
[1] 1.809524
erdos.renyi.game(n = gorder(g), p.or.m = edge_density(g), type = "gnp")
IGRAPH beeafa1 U--- 7 9 -- Erdos-Renyi (gnp) graph
+ attr: name (g/c), type (g/c), loops (g/l), p (g/n)
+ edges from beeafa1:
[1] 1--3 2--4 3--4 1--5 2--5 4--5 2--6 4--6 2--7
gl <- vector('list',1000)
for(i in 1:1000){
gl[[i]] <- erdos.renyi.game(
n = gorder(g),
p.or.m = edge_density(g),
type = "gnp"
)
}
gl.apls <- unlist(
lapply(gl, mean_distance, directed = FALSE)
)
hist(gl.apls, breaks = 20)
abline(
v = mean_distance(
g, directed=FALSE
),
col = "red",
lty = 3,
lwd = 2
)

Network Substructures

triangles(g)
+ 3/7 vertices, named, from bc1823c:
[1] Valentine George    Stephanie
transitivity(g)
[1] 0.2142857
transitivity(g,
vids = 'Valentine',
type = "local")
[1] 0.1
count_triangles(g, vids ='Valentine')
[1] 1
count_triangles(g, vids = 'Loid')
[1] 0
transitivity(g,
vids = 'Stephanie',
type = "local")
[1] 1

Identifying cliques

largest_cliques(g)
[[1]]
+ 3/7 vertices, named, from bc1823c:
[1] Stephanie Valentine George   
max_cliques(g)
[[1]]
+ 2/7 vertices, named, from bc1823c:
[1] Nyaradzo  Valentine

[[2]]
+ 2/7 vertices, named, from bc1823c:
[1] Mcnamara  Valentine

[[3]]
+ 2/7 vertices, named, from bc1823c:
[1] Loid      Valentine

[[4]]
+ 2/7 vertices, named, from bc1823c:
[1] Tinashe George 

[[5]]
+ 3/7 vertices, named, from bc1823c:
[1] Stephanie Valentine George   

Close relationships:

assortativity & reciprocity The preferential a

assortativity.degree(
g,
directed = FALSE
)
[1] -0.7368421
reciprocity(g)
[1] 1

Community detection

fastgreedy.community(g)
IGRAPH clustering fast greedy, groups: 2, mod: 0.2
+ groups:
  $`1`
  [1] "Stephanie" "George"    "Tinashe"  
  
  $`2`
  [1] "Valentine" "Mcnamara"  "Loid"      "Nyaradzo" 
  
edge.betweenness.community(g)
IGRAPH clustering edge betweenness, groups: 2, mod: 0.2
+ groups:
  $`1`
  [1] "Valentine" "Mcnamara"  "Loid"      "Nyaradzo" 
  
  $`2`
  [1] "Stephanie" "George"    "Tinashe"  
  
x <- fastgreedy.community(g)
length(x)
[1] 2
sizes(x)
Community sizes
1 2 
3 4 
membership(x)
Valentine  Mcnamara      Loid  Nyaradzo Stephanie    George   Tinashe 
        2         2         2         2         1         1         1 
plot(x, g)

library(threejs)
Warning: package 'threejs' was built under R version 4.2.1
graphjs(g)

Adding attributes

# g <- set_vertex_attr(
# g,
# "label",
# value = V(g)$name
# )
# g <- set_vertex_attr(
# g,
# "color",
# value = "mistyrose"
# )
# graphjs(g, vertex.size = 1)

Coloring communities

{# {r} # x = edge.betweenness.community(g) # i <- membership(x) # g <- set_vertex_attr( # g, # "color", # value = c( # "yellow", "blue", "red" # )[i] # ) # graphjs(g)