Network Manipulation Script

This script contains code for computing indegree/outdegree and homophily measures, and for visualising both networks and sub-communities within networks

# Read in your edgelist. We will use 2021 as an example.
# NOTE: this script relies on dplyr verbs and the %>% pipe together with
# igraph, tidygraph and ggraph functions, so those packages must be attached.
edges_21 <- read.csv("~/Downloads/edgeList_closeFrds_fa21.csv")

# Creating a nodelist of all nominators and nominees.
# Only the first two columns are pooled: graph_from_data_frame() reads those
# two columns as the endpoints of each edge and treats every further column as
# an edge attribute, so values from any extra column must not become nodes.
nodes_21 <- data.frame(
  PID = unlist(edges_21[, 1:2], use.names = FALSE)
) %>%
  distinct(PID, .keep_all = FALSE)

# Build the directed network from the edgelist and nodelist, then convert it
# into a tidygraph object. g_21 is the name of our graph object.
g_21 <- edges_21 %>%
  graph_from_data_frame(directed = TRUE, vertices = nodes_21) %>%
  as_tbl_graph()

Indegree and Outdegree

# Total degree, in-degree and out-degree for each participant are read
# straight off the graph object and stored as columns of the nodelist.
# The values are added positionally, so this relies on g_21 having been built
# with nodes_21 as its vertex list (same order of participants).

nodes_21 <- nodes_21 %>%
  mutate(
    degree = degree(g_21),
    indegree = degree(g_21, mode = "in"),
    outdegree = degree(g_21, mode = "out")
  )

Homophily

# Participant information can be joined onto the node-list and then used to
# generate network stats. For example, joining each person's dorm lets us
# calculate network modularity (homophily) by dorm. Positive modularity means
# more ties fall within dorms than expected by chance (dorm homophily);
# negative modularity means fewer do (dorm heterophily).

# Read the PID information csv, keep the columns of interest (dorm included)
# and retain a single row per PID so the join below cannot duplicate nodes.

PID_21 <- read.csv("/Volumes/GoogleDrive/Shared drives/Box SSNL Folder/SSNL Social Networks Study/2021-2022/Rosters/Housing Rosters/PIDinfo_long_2021-2022_autoentry_Special_Characters_Changed_wHousing.csv") %>%
  select(PID, dorm, DID, cohortYear) %>%
  distinct(PID, .keep_all = TRUE)

# Attach the PID information to our node-list; every node is kept, and nodes
# without a matching PID row get NA in the new columns.

nodes_21 <- left_join(nodes_21, PID_21, by = "PID")


# The graph has to be rebuilt as a tidygraph object whenever additional
# variables are added to the nodelist.

g_21 <- edges_21 %>%
  graph_from_data_frame(directed = TRUE, vertices = nodes_21) %>%
  as_tbl_graph()

# Network modularity by dorm, using the igraph package.
# NOTE(review): nodes without a PID-info match carry an NA dorm after the
# left join; confirm how the installed igraph version treats NA membership.
g_21 %>%
  modularity(membership = as.factor(V(g_21)$dorm))
## [1] 0.4290863
# eg.2 - Trait information can be joined in the same way and used to calculate
# network stats. Here we prepare to calculate modularity by partyID.

# Read the trait data, keep only PID and partyID, and drop duplicated PIDs.

trait_21 <- read.csv("~/Downloads/df.trait_fa21.csv") %>%
  select(PID, partyID) %>%
  distinct(PID, .keep_all = TRUE)

# Add the new information to our nodelist.

nodes_21 <- left_join(nodes_21, trait_21, by = "PID")

# Rebuild the tidygraph object so it picks up the partyID column.

g_21 <- edges_21 %>%
  graph_from_data_frame(directed = TRUE, vertices = nodes_21) %>%
  as_tbl_graph()

# Modularity by partyID.
# NOTE(review): partyID is NA for nodes without trait data; confirm how the
# installed igraph version treats NA membership values.

g_21 %>%
  modularity(membership = as.factor(V(g_21)$partyID))
## [1] -0.08084813

Network Visualisation

# A basic visualisation of our Stanford undergrad network. Nodes can be given
# different colors; in this case each node is colored by its partyID. The
# legend keys are enlarged because the nodes themselves are drawn very small.

ggraph(activate(g_21, nodes), layout = "stress") +
  geom_edge_fan(width = 0.1, color = "lightblue") +
  geom_node_point(aes(color = partyID), size = 0.05) +
  guides(color = guide_legend(override.aes = list(size = 5))) +
  coord_fixed() +
  theme_graph()

# In this example there are lots of NA's, so it is worth removing them and
# re-running the code. We take our nodelist (the list of all people in the
# network) and drop everyone we have no political data for, i.e. people whose
# partyID is missing or an empty string.
# Note: the degree columns computed earlier on the full network are not
# recomputed here.

nodes_21 <- nodes_21 %>%
  filter(!is.na(partyID), partyID != "")

# Now let's filter our edgelist so that it only includes connections where
# both the nominator and the nominee are in our new nodelist.

edges_21 <- edges_21 %>%
  filter(PID %in% nodes_21$PID, nom %in% nodes_21$PID)

# Remaking our tidygraph object from the reduced edgelist and nodelist.

g_21 <- edges_21 %>%
  graph_from_data_frame(directed = TRUE, vertices = nodes_21) %>%
  as_tbl_graph()
# Now let's re-visualise our network with all the NA's removed, using the same
# layout, edge and node settings as before.

ggraph(activate(g_21, nodes), layout = "stress") +
  geom_edge_fan(width = 0.1, color = "lightblue") +
  geom_node_point(aes(color = partyID), size = 0.05) +
  guides(color = guide_legend(override.aes = list(size = 5))) +
  coord_fixed() +
  theme_graph()

# Clean!
# Let's see whether removing the NA's changes our modularity measure.

g_21 %>%
  modularity(membership = as.factor(V(g_21)$partyID))
## [1] 0.04418045
# We've gone from -0.08084813 to 0.04418045 by removing all NA's. Bear in mind
# that our network is much smaller now that we've removed everyone who didn't
# submit political data.

Community Detection (Under Construction)

# Community detection

# We can use igraph functions to look at sub-communities that appear in our
# network (clusters of students, aka a friendship group).

# The directed network is converted to an undirected one once and reused for
# the clustering and for both plots below.
g_21_undirected <- as.undirected(g_21)

# This example detects communities by greedy (yum) optimization of modularity
# https://kateto.net/netscix2016.html
cfg <- cluster_fast_greedy(g_21_undirected)

# Let's visualise this network
plot(cfg, g_21_undirected)

# This plot isn't great. Let's spruce it up so we can see what's going on:
# we remove the node labels, shrink the nodes, hide the edges and highlight
# the first four detected communities.
#
# mark.groups needs a list of vertex-id vectors (one per highlighted group).
# A plain numeric vector such as c(1, 2, 3, 4) is read as ONE group made of
# vertices 1-4, not as community numbers, so the member lists of the first
# (up to) four communities are passed instead.
# NOTE(review): this assumes the intent was to highlight communities 1-4.
marked_groups <- communities(cfg)
marked_groups <- marked_groups[seq_len(min(4, length(marked_groups)))]

# edge.color is written out in full; the shorter `edge = NA` only took effect
# through partial argument matching.
plot(
  cfg,
  g_21_undirected,
  vertex.label = NA,
  vertex.size = 2,
  edge.color = NA,
  mark.groups = marked_groups
)