Harry Potter Network Exploration: Welcome to Harry Potter Network Exploration

#This code chunk will contain all necessary libraries and packages for this analysis.

library(dplyr)
library(ggplot2)
library(igraph)
library(igraphdata)
library(statnet)

#This code chunk will contain all necessary libraries and packages for this analysis.

library(dplyr)
library(ggplot2)
library(igraph)
library(igraphdata)
library(statnet)

#Set my wd to my own path to the data
setwd("~/DACCS R/Networks/HP Data/bossaert_meidert_harrypotter")

# Read data
book4 <- as.matrix(read.table("hpbook4.txt"))
hp.attributes <- as.matrix(read.table("hpattributes.txt", header=TRUE))
hp.name <- as.matrix(read.table("hpnames.txt", header=TRUE))

Next, I need to create network objects.

book4.ig <- graph.adjacency(book4)

book4.stat <- as.network.matrix(book4)

Remove the loops in the creation of the igraph object:

book4a.ig <- graph.adjacency(book4, diag = 0)

#plot(book4a.ig)

simple4ege <- get.edgelist(book4a.ig)

simple4a.ig <- graph.edgelist(simple4ege)

#plot(simple4a.ig)

Okay, so I’m going to try and replace the column names:

colnames(work4)

 [1] "V1"  "V2"  "V3"  "V4"  "V5"  "V6"  "V7"  "V8"  "V9"  "V10" "V11"
[12] "V12" "V13" "V14" "V15" "V16" "V17" "V18" "V19" "V20" "V21" "V22"
[23] "V23" "V24" "V25" "V26" "V27" "V28" "V29" "V30" "V31" "V32" "V33"
[34] "V34" "V35" "V36" "V37" "V38" "V39" "V40" "V41" "V42" "V43" "V44"
[45] "V45" "V46" "V47" "V48" "V49" "V50" "V51" "V52" "V53" "V54" "V55"
[56] "V56" "V57" "V58" "V59" "V60" "V61" "V62" "V63" "V64"

as.vector(names)

# A tibble: 1 × 64
  ` 1`     ` 2`  ` 3`  ` 4`  ` 5`  ` 6`  ` 7`  ` 8`  ` 9`  `10`  `11` 
  <chr>    <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 Adrian … Alic… Ange… Anth… Blai… C. W… Cedr… Cho … Coli… Corm… Dean…
# … with 53 more variables: `12` <chr>, `13` <chr>, `14` <chr>,
#   `15` <chr>, `16` <chr>, `17` <chr>, `18` <chr>, `19` <chr>,
#   `20` <chr>, `21` <chr>, `22` <chr>, `23` <chr>, `24` <chr>,
#   `25` <chr>, `26` <chr>, `27` <chr>, `28` <chr>, `29` <chr>,
#   `30` <chr>, `31` <chr>, `32` <chr>, `33` <chr>, `34` <chr>,
#   `35` <chr>, `36` <chr>, `37` <chr>, `38` <chr>, `39` <chr>,
#   `40` <chr>, `41` <chr>, `42` <chr>, `43` <chr>, `44` <chr>, …

colnames(work4) <- names

colnames(work4)

 [1] "Adrian Pucey"           "Alicia Spinnet"        
 [3] "Angelina Johnson"       "Anthony Goldstein"     
 [5] "Blaise Zabini"          "C. Warrington"         
 [7] "Cedric Diggory"         "Cho Chang"             
 [9] "Colin Creevey"          "Cormac McLaggen"       
[11] "Dean Thomas"            "Demelza Robins"        
[13] "Dennis Creevey"         "Draco Malfoy"          
[15] "Eddie Carmichael"       "Eleanor Branstone"     
[17] "Ernie Macmillan"        "Euan Abercrombie"      
[19] "Fred Weasley"           "George Weasley"        
[21] "Ginny Weasley"          "Graham Pritchard"      
[23] "Gregory Goyle"          "Hannah Abbott"         
[25] "Harry James Potter"     "Hermione Granger"      
[27] "Jimmy Peakes"           "Justin Finch-Fletchley"
[29] "Katie Bell"             "Kevin Whitby"          
[31] "Lavender Brown"         "Leanne"                
[33] "Lee Jordan"             "Lucian Bole"           
[35] "Luna Lovegood"          "Malcolm Baddock"       
[37] "Mandy Brocklehurst"     "Marcus Belby"          
[39] "Marcus Flint"           "Michael Corner"        
[41] "Miles Bletchley"        "Millicent Bulstrode"   
[43] "Natalie McDonald"       "Neville Longbottom"    
[45] "Oliver Wood"            "Orla Quirke"           
[47] "Owen Cauldwell"         "Padma Patil"           
[49] "Pansy Parkinson"        "Parvati Patil"         
[51] "Penelope Clearwater"    "Percy Weasley"         
[53] "Peregrine Derrick"      "Roger Davies"          
[55] "Romilda Vane"           "Ronald Weasley"        
[57] "Rose Zeller"            "Seamus Finnigan"       
[59] "Stewart Ackerley"       "Susan Bones"           
[61] "Terry Boot"             "Theodore Nott"         
[63] "Vincent Crabbe"         "Zacharias Smith"

HOLY CRAP I THINK I DID IT

#work4 is our working dataframe - convert to a matrix

work4 <- as.matrix(work4)

#convert to graph object

work4.ig <- graph.adjacency(work4, diag = 0)

#now we're going to get only the connected part of the graph

work4edge <- get.edgelist(work4.ig)

work4_simple.ig <- graph.edgelist(work4edge)

library(network)
library(sna)
library(ggraph)
library(visNetwork)
library(threejs)
library(networkD3)
library(ndtv)

Plot:

ggraph(work4_simple.ig, layout = 'lgl') +

geom_edge_arc(color="gray", strength=0.3, arrow = arrow(length = unit(4, 'mm'))) +

geom_node_point() +

geom_node_text(aes(label = name), size=3, repel=T) +

theme_void() +

labs(title = "Student Support Network in Harry Potter and the Goblet of Fire")

book4a.ig <- graph.adjacency(book4, diag = 0)

#First attempt - guessing I actually already did this, oh well!

work4_adjacency<-as.matrix(as_adjacency_matrix(work4.ig))

old4 <- work4_adjacency

old4.ig <- graph.adjacency(old4)

class(old4.ig)

[1] "igraph"

new4.ig <- igraph::delete.vertices(old4.ig, which(igraph::degree(old4.ig)==0))

#plot(new4.ig)

Create Nodes Dataframe

Note to self: use new4.ig, new4.stat

#Create nodes df for new and add all degree

#Confirm degree is numeric

new4.nodes<-data.frame(name=V(new4.ig)$name, degree=igraph::degree(new4.ig))

#add in degree and out degree

new4.nodes <- new4.nodes %>%

mutate(indegree=igraph::degree(new4.ig, mode="in", loops=FALSE),

outdegree=igraph::degree(new4.ig, mode="out", loops=FALSE))

#add eigenCentrality

newEigen <- centr_eigen(new4.ig, directed = T)

newEigen$vector

 [1] 2.460367e-01 6.053405e-02 1.444513e-18 0.000000e+00 9.394659e-01
 [6] 9.394659e-01 2.311431e-01 1.000000e+00 9.394659e-01 7.083229e-01
[11] 0.000000e+00 9.394659e-01

newEigen$centralization

[1] 0.5451

new4.eigen<-cbind(newEigen$vector,V(new4.ig)$name)

new4.eigenDF <- data_frame(new4.eigen)

#add to our nodes

new4.nodes <- new4.nodes %>%

mutate(eigenCentrality = new4.eigen)

Calculate derived and reflected centrality and add to nodes df

old4_adj <- as.matrix(graph.adjacency(old4))

new4_adj <- as.matrix(as_adjacency_matrix(new4.ig))

old4.stat <- as.network.matrix(old4_adj)

new4.stat <- as.network.matrix(new4_adj)

print(new4.stat)

 Network attributes:
  vertices = 12 
  directed = TRUE 
  hyper = FALSE 
  loops = FALSE 
  multiple = FALSE 
  bipartite = FALSE 
  total edges= 34 
    missing edges= 0 
    non-missing edges= 34 

 Vertex attribute names: 
    vertex.names 

No edge attributes

#Matrix to 2nd power for two step derived/reflected centrality measurement.

new4_adjacency2<-new4_adj %*% new4_adj

#Calculate the proportion of reflected centrality.

new_rc<-diag(as.matrix(new4_adjacency2))/rowSums(as.matrix(new4_adjacency2))

new_rc<-ifelse(is.nan(new_rc),0,new_rc)

head(new_rc)

Cedric Diggory      Cho Chang  Colin Creevey Dennis Creevey 
     0.1428571      0.0000000      0.1250000      0.1250000 
  Fred Weasley George Weasley 
     0.2000000      0.1904762

#Calculate the proportion of derived centrality.

new_dc<-1-diag(as.matrix(new4_adjacency2))/rowSums(as.matrix(new4_adjacency2))

new_dc<-ifelse(is.nan(new_dc),1,new_dc)

head(new_dc)

Cedric Diggory      Cho Chang  Colin Creevey Dennis Creevey 
     0.8571429      1.0000000      0.8750000      0.8750000 
  Fred Weasley George Weasley 
     0.8000000      0.8095238

#Add to nodes

new4.nodes <- new4.nodes %>%

mutate(derCentrality = new_dc,

reflectedCentrality = new_rc)

Now we’ll add in our closesness, betweeness and brokerage scores:

#calculate closeness centrality: igraph

closeIG <- igraph::closeness(new4.ig)

head(igraph::closeness(new4.ig))

Cedric Diggory      Cho Chang  Colin Creevey Dennis Creevey 
    0.06666667     0.06250000     0.05000000     0.05000000 
  Fred Weasley George Weasley 
    0.08333333     0.07692308

#calculate closeness centrality: statnet

head(sna::closeness(new4.stat))

[1] 0 0 0 0 0 0

closeIG_df <- data_frame(closeIG)

new4.nodes <- new4.nodes %>%

mutate(closeness = closeIG_df$closeIG)

#Betweeness Centrality

head(igraph::betweenness(new4.ig, directed=TRUE, weights=NA))

Cedric Diggory      Cho Chang  Colin Creevey Dennis Creevey 
             8              0              0              0 
  Fred Weasley George Weasley 
             9              0

betweenIG <- igraph::betweenness(new4.ig, directed=TRUE, weights=NA)

betweenIG_df <- data_frame(betweenIG)

new4.nodes <- new4.nodes %>%

mutate(between = betweenIG_df$betweenIG)

Adding constraint

constraintIG <- constraint(new4.ig)

constraintIG_df <- data_frame(constraintIG)

new4.nodes <- new4.nodes %>%

mutate(constraint = constraintIG_df$constraintIG)

Brokerage:

#return matrix of standardized brokerage scores

head(brokerage(new4.stat, cl = new4.nodes$name)$z.nli)

               w_I w_O b_IO b_OI        b_O          t
Cedric Diggory NaN NaN  NaN  NaN -1.0136651 -1.0136651
Cho Chang      NaN NaN  NaN  NaN -1.2430947 -1.2430947
Colin Creevey  NaN NaN  NaN  NaN -1.2430947 -1.2430947
Dennis Creevey NaN NaN  NaN  NaN -1.2430947 -1.2430947
Fred Weasley   NaN NaN  NaN  NaN -0.3253764 -0.3253764
George Weasley NaN NaN  NaN  NaN -1.2430947 -1.2430947

Quick Reminder of the types of brokerage:

Liaison: all three actors occupy different groups
Representative: one member of a subgroup has role (either assumed or assigned) of communicating/negotiating with outsiders
Gatekeeper: actor screens or gathers resources and distributes them to members of own group
Itinerant/Cosmopolitan: principals belong to same group while intermediary belongs to a different group
Coordinator: all actors belong to the same group

b_O: Liaison role; the broker mediates contact between two individuals from different groups, neither of which is the group to which he or she belongs. Two-path structure: A -> B -> C

In this evaluation, liaison and total brokerage are the same, so we’ll add it to the nodes df as liaison/total

brokerage <- brokerage(new4.stat, cl = new4.nodes$name)$z.nli

brokerage_df <- as.data.frame(brokerage)

new4.nodes <- new4.nodes %>%

mutate(liaison_total = brokerage_df$b_O)

[Note: I tried to add the Bonacich measures and they continue to fail.]

Now I’m going to try and run some network statistics based on our Week 9 tutorial code.

First, we’ll take a look at transitivity of our Harry Potter support network. We are using the reduced network that has all isolates removed.

Recall:

#new4.stat <- as.network.matrix(book4)
print(new4.stat)

 Network attributes:
  vertices = 12 
  directed = TRUE 
  hyper = FALSE 
  loops = FALSE 
  multiple = FALSE 
  bipartite = FALSE 
  total edges= 34 
    missing edges= 0 
    non-missing edges= 34 

 Vertex attribute names: 
    vertex.names 

No edge attributes

sna::dyad.census(new4.stat)

     Mut Asym Null
[1,]  12   10   44

Triad Census

sna::triad.census(new4.stat)

     003 012 102 021D 021U 021C 111D 111U 030T 030C 201 120D 120U
[1,]  78  36  44    0    5    5   21   11    0    0   4    1    4
     120C 210 300
[1,]    1   0  10

Global Transitivity

#get network transitivity: igraph
transitivity(new4.ig)

[1] 0.5106383

Graph Density

graph.density(new4.ig)

[1] 0.2575758

Now we’ll assess whether our HP Network is different from a random network.

First, condition on: size

[Do I need to use a matrix object? Or can I use a graph object? This is an SNA function.]

#compare network transitivity to null conditional on size
trans.cug<-cug.test(new4.stat, FUN=gtrans, mode="digraph", cmode="size")
trans.cug


Univariate Conditional Uniform Graph Test

Conditioning Method: size 
Graph Type: digraph 
Diagonal Used: FALSE 
Replications: 1000 

Observed Value: 0.6016949 
Pr(X>=Obs): 0.016 
Pr(X<=Obs): 0.984

Histogram with observed compared to replication statistics

#plot vs simulation results
plot(trans.cug)

From this test, conditioning on size, the probability that our HP Network transitivity is greater than random is p <= .015, which is stastitically significant (using p<=.05 as the measure for significance).

We can also test this:

#function to return t statistic
cug.t<-function(cug.object){
  (cug.object$obs.stat-mean(cug.object$rep.stat))/sd(cug.object$rep.stat)
}

#t-stat between observed and simulated networks
cug.t(trans.cug)

[1] 2.100745

[This is where I had question on the original tutorial - In fact, if we wanted to know how many standard errors away the observed transitivity value is from what we would expect, on average, this can be calulated using the cug.test object as follows. So am I correct that this metric that’s returned is a standard error, not standard deviation? I still find this confusing!]

We can now run similar analyses using different metrics.

For this network, I’m also interested in conditioning on edges:

#compare network transitivity to null conditional on size
transE.cug<-cug.test(new4.stat, FUN=gtrans, mode="digraph", cmode="edges")
transE.cug


Univariate Conditional Uniform Graph Test

Conditioning Method: edges 
Graph Type: digraph 
Diagonal Used: FALSE 
Replications: 1000 

Observed Value: 0.6016949 
Pr(X>=Obs): 0 
Pr(X<=Obs): 1

plot(transE.cug)

I interpret this result to say that our HP network is significantly higher in transitivity (conditioning on edges) than random. This makes sense, particularly as this is a support network with isolates removed. Let’s run this on the full network and see what we get!

#compare network transitivity to null conditional on size
transEfull.cug<-cug.test(book4.stat, FUN=gtrans, mode="digraph", cmode="edges")
transEfull.cug


Univariate Conditional Uniform Graph Test

Conditioning Method: edges 
Graph Type: digraph 
Diagonal Used: FALSE 
Replications: 1000 

Observed Value: 0.6016949 
Pr(X>=Obs): 0 
Pr(X<=Obs): 1

plot(transEfull.cug)

Hm. So adding in our isolates doesn’t change anything about our transitivity?

Moving on!

Now I’m going to test on network degree centralization, again looking at our smaller network without isolates and our larger with the isolates.

without isolates

#compare network degree centralization to null conditional on size
c.degree.cugSmall <-cug.test(new4.stat,FUN=centralization,  FUN.arg=list(FUN=degree, cmode="indegree"), mode="digraph", cmode="size") 
#plot vs simulation results
plot(c.degree.cugSmall)

Again, a result that makes total sense, given the way this network was constructed.

with isolates

#compare network degree centralization to null conditional on size
c.degree.cugFull <-cug.test(book4.stat, FUN=centralization,  FUN.arg=list(FUN=degree, 
                              cmode="indegree"), mode="digraph", cmode="size") 
c.degree.cugFull


Univariate Conditional Uniform Graph Test

Conditioning Method: size 
Graph Type: digraph 
Diagonal Used: FALSE 
Replications: 1000 

Observed Value: 0.1365583 
Pr(X>=Obs): 0.635 
Pr(X<=Obs): 0.37

#plot vs simulation results
plot(c.degree.cugFull)

Now this looks very different! Let’s run the t statistic.

#t-stat between observed and simulated networks
cug.t(c.degree.cugFull)

[1] -0.4361202

So with our isolates included, this network does not look different from a random network!

Now let’s look at betweenness:

No Isolates

#compare network betweenness centralization to null conditional on size
b.degree.cugSmall <-cug.test(new4.stat, FUN=centralization,  FUN.arg=list(FUN=betweenness, cmode="directed"), mode="digraph", cmode="size", reps=100) 
b.degree.cugSmall


Univariate Conditional Uniform Graph Test

Conditioning Method: size 
Graph Type: digraph 
Diagonal Used: FALSE 
Replications: 100 

Observed Value: 0.3661157 
Pr(X>=Obs): 0 
Pr(X<=Obs): 1

#plot vs simulation results
plot(b.degree.cugSmall)

So, without our isolates, this is very different from a random network.

With Isolates

#compare network betweenness centralization to null conditional on size
b.degree.cugFll <-cug.test(book4.stat, FUN=centralization,  FUN.arg=list(FUN=betweenness, cmode="directed"), mode="digraph", cmode="size", reps=100) 
b.degree.cugFll


Univariate Conditional Uniform Graph Test

Conditioning Method: size 
Graph Type: digraph 
Diagonal Used: FALSE 
Replications: 100 

Observed Value: 0.0107107 
Pr(X>=Obs): 0 
Pr(X<=Obs): 1

#plot vs simulation results
plot(b.degree.cugFll)

This is very interesting to me! That even with our isolates, we still have a higher measure of betweenness for this network than random. I wonder if this is because Harry’s betweenness score is very high? And that drives up the whole network?

Now we’ll test if this network looks like a preferential attachment network comparing to our network with isolates removed.

library(intergraph)

#create empty dataframe for simulations
trials<-data.frame(id=1:100, gdens=NA, gtrans=NA, cent.deg=NA, cent.bet=NA)

#simulate PA networks and add stats to trials dataframe: size
for ( i in 1:100 ){ 
  pa.ig<- igraph::sample_pa(n = network.size(new4.stat), directed=TRUE)
  pa.stat<-intergraph::asNetwork(pa.ig)
  trials$gdens<-gden(pa.stat)
  trials$gtrans[i] <- gtrans(pa.stat)
  trials$cent.deg[i] <- centralization(pa.stat, FUN=degree, cmode="indegree")
  trials$cent.bet[i] <-centralization(pa.stat, FUN=betweenness)
}

summary(trials)

       id             gdens             gtrans     cent.deg     
 Min.   :  1.00   Min.   :0.08333   Min.   :0   Min.   :0.2066  
 1st Qu.: 25.75   1st Qu.:0.08333   1st Qu.:0   1st Qu.:0.3058  
 Median : 50.50   Median :0.08333   Median :0   Median :0.4050  
 Mean   : 50.50   Mean   :0.08333   Mean   :0   Mean   :0.4615  
 3rd Qu.: 75.25   3rd Qu.:0.08333   3rd Qu.:0   3rd Qu.:0.6033  
 Max.   :100.00   Max.   :0.08333   Max.   :0   Max.   :0.9008  
    cent.bet       
 Min.   :0.006612  
 1st Qu.:0.023554  
 Median :0.036364  
 Mean   :0.042810  
 3rd Qu.:0.058884  
 Max.   :0.129752

sim.t<-function(g, trials){
  temp<-data.frame(density=c(gden(g),mean(trials$gdens),sd(trials$gdens)),
             transitivity=c(gtrans(g),mean(trials$gtrans),sd(trials$gtrans)),
             indegCent=c(centralization(g, FUN=degree, cmode="indegree"),mean(trials$cent.deg), sd(trials$cent.deg)),
             betwCent=c(centralization(g, FUN=betweenness), mean(trials$cent.bet), sd(trials$cent.bet)))
  rownames(temp)<-c("Observed","Simulated", "SD")
  temp<-data.frame(t(temp))
  temp$tvalue<-(temp$Observed-temp$Simulated)/temp$SD
  temp
}

plot.sim.t<-function(g,trials){
  temp<-data.frame(net.stat=c("gtrans","cent.deg","cent.bet"), x=c(gtrans(g),centralization(g, FUN=degree, cmode="indegree"), centralization(g, FUN=betweenness)))
  trials%>%
    select(gtrans:cent.bet)%>%
    gather(key="net.stat",value = "estimate")%>%
    ggplot(aes(estimate)) +
    geom_histogram() +
    facet_wrap(net.stat ~ ., scales="free", ncol=3) +
    geom_vline(data=temp, aes(xintercept=x),
               linetype="dashed", size=1, colour="red")
}

#check for differences from null
sim.t(g=new4.stat,trials)

              Observed  Simulated         SD     tvalue
density      0.2575758 0.08333333 0.00000000        Inf
transitivity 0.6016949 0.00000000 0.00000000        Inf
indegCent    0.6115702 0.46148760 0.15963585  0.9401563
betwCent     0.3661157 0.04280992 0.02605203 12.4100023

plot.sim.t(new4.stat, trials)

So, this is interesting, which is that both betweenness and transitivity are very different from random, but degree centrality is not. So, even though this network looks random based on number of connections, it’s actually not, because of the nature of those connections? At least in comparison to a preferential attachment network.