Week 9 - Network Tests Exploration
#Set my wd to my own path to the data
setwd("~/DACCS R/Networks/HP Data/bossaert_meidert_harrypotter")
# Read data
book4 <- as.matrix(read.table("hpbook4.txt"))
hp.attributes <- as.matrix(read.table("hpattributes.txt", header=TRUE))
hp.name <- as.matrix(read.table("hpnames.txt", header=TRUE))
Next, I need to create network objects.
book4.ig <- graph.adjacency(book4)
book4.stat <- as.network.matrix(book4)
Remove the loops in the creation of the igraph object:
book4a.ig <- graph.adjacency(book4, diag = 0)
#plot(book4a.ig)
simple4ege <- get.edgelist(book4a.ig)
simple4a.ig <- graph.edgelist(simple4ege)
#plot(simple4a.ig)
Okay, so I’m going to try and replace the column names:
colnames(work4)
[1] "V1" "V2" "V3" "V4" "V5" "V6" "V7" "V8" "V9" "V10" "V11"
[12] "V12" "V13" "V14" "V15" "V16" "V17" "V18" "V19" "V20" "V21" "V22"
[23] "V23" "V24" "V25" "V26" "V27" "V28" "V29" "V30" "V31" "V32" "V33"
[34] "V34" "V35" "V36" "V37" "V38" "V39" "V40" "V41" "V42" "V43" "V44"
[45] "V45" "V46" "V47" "V48" "V49" "V50" "V51" "V52" "V53" "V54" "V55"
[56] "V56" "V57" "V58" "V59" "V60" "V61" "V62" "V63" "V64"
as.vector(names)
# A tibble: 1 × 64
` 1` ` 2` ` 3` ` 4` ` 5` ` 6` ` 7` ` 8` ` 9` `10` `11`
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 Adrian … Alic… Ange… Anth… Blai… C. W… Cedr… Cho … Coli… Corm… Dean…
# … with 53 more variables: `12` <chr>, `13` <chr>, `14` <chr>,
# `15` <chr>, `16` <chr>, `17` <chr>, `18` <chr>, `19` <chr>,
# `20` <chr>, `21` <chr>, `22` <chr>, `23` <chr>, `24` <chr>,
# `25` <chr>, `26` <chr>, `27` <chr>, `28` <chr>, `29` <chr>,
# `30` <chr>, `31` <chr>, `32` <chr>, `33` <chr>, `34` <chr>,
# `35` <chr>, `36` <chr>, `37` <chr>, `38` <chr>, `39` <chr>,
# `40` <chr>, `41` <chr>, `42` <chr>, `43` <chr>, `44` <chr>, …
[1] "Adrian Pucey" "Alicia Spinnet"
[3] "Angelina Johnson" "Anthony Goldstein"
[5] "Blaise Zabini" "C. Warrington"
[7] "Cedric Diggory" "Cho Chang"
[9] "Colin Creevey" "Cormac McLaggen"
[11] "Dean Thomas" "Demelza Robins"
[13] "Dennis Creevey" "Draco Malfoy"
[15] "Eddie Carmichael" "Eleanor Branstone"
[17] "Ernie Macmillan" "Euan Abercrombie"
[19] "Fred Weasley" "George Weasley"
[21] "Ginny Weasley" "Graham Pritchard"
[23] "Gregory Goyle" "Hannah Abbott"
[25] "Harry James Potter" "Hermione Granger"
[27] "Jimmy Peakes" "Justin Finch-Fletchley"
[29] "Katie Bell" "Kevin Whitby"
[31] "Lavender Brown" "Leanne"
[33] "Lee Jordan" "Lucian Bole"
[35] "Luna Lovegood" "Malcolm Baddock"
[37] "Mandy Brocklehurst" "Marcus Belby"
[39] "Marcus Flint" "Michael Corner"
[41] "Miles Bletchley" "Millicent Bulstrode"
[43] "Natalie McDonald" "Neville Longbottom"
[45] "Oliver Wood" "Orla Quirke"
[47] "Owen Cauldwell" "Padma Patil"
[49] "Pansy Parkinson" "Parvati Patil"
[51] "Penelope Clearwater" "Percy Weasley"
[53] "Peregrine Derrick" "Roger Davies"
[55] "Romilda Vane" "Ronald Weasley"
[57] "Rose Zeller" "Seamus Finnigan"
[59] "Stewart Ackerley" "Susan Bones"
[61] "Terry Boot" "Theodore Nott"
[63] "Vincent Crabbe" "Zacharias Smith"
HOLY CRAP I THINK I DID IT
#work4 is our working dataframe - convert to a matrix
work4 <- as.matrix(work4)
#convert to graph object
work4.ig <- graph.adjacency(work4, diag = 0)
#now we're going to get only the connected part of the graph
work4edge <- get.edgelist(work4.ig)
work4_simple.ig <- graph.edgelist(work4edge)
Plot:
ggraph(work4_simple.ig, layout = 'lgl') +
geom_edge_arc(color="gray", strength=0.3, arrow = arrow(length = unit(4, 'mm'))) +
geom_node_point() +
geom_node_text(aes(label = name), size=3, repel=T) +
theme_void() +
labs(title = "Student Support Network in Harry Potter and the Goblet of Fire")
book4a.ig <- graph.adjacency(book4, diag = 0)
#First attempt - guessing I actually already did this, oh well!
work4_adjacency<-as.matrix(as_adjacency_matrix(work4.ig))
old4 <- work4_adjacency
old4.ig <- graph.adjacency(old4)
class(old4.ig)
[1] "igraph"
new4.ig <- igraph::delete.vertices(old4.ig, which(igraph::degree(old4.ig)==0))
#plot(new4.ig)
Create Nodes Dataframe
Note to self: use new4.ig, new4.stat
#Create nodes df for new and add all degree
#Confirm degree is numeric
new4.nodes<-data.frame(name=V(new4.ig)$name, degree=igraph::degree(new4.ig))
#add in degree and out degree
new4.nodes <- new4.nodes %>%
mutate(indegree=igraph::degree(new4.ig, mode="in", loops=FALSE),
outdegree=igraph::degree(new4.ig, mode="out", loops=FALSE))
#add eigenCentrality
newEigen <- centr_eigen(new4.ig, directed = T)
newEigen$vector
[1] 2.460367e-01 6.053405e-02 1.444513e-18 0.000000e+00 9.394659e-01
[6] 9.394659e-01 2.311431e-01 1.000000e+00 9.394659e-01 7.083229e-01
[11] 0.000000e+00 9.394659e-01
newEigen$centralization
[1] 0.5451
new4.eigen<-cbind(newEigen$vector,V(new4.ig)$name)
new4.eigenDF <- data_frame(new4.eigen)
#add to our nodes
new4.nodes <- new4.nodes %>%
mutate(eigenCentrality = new4.eigen)
Calculate derived and reflected centrality and add to nodes df
old4_adj <- as.matrix(graph.adjacency(old4))
new4_adj <- as.matrix(as_adjacency_matrix(new4.ig))
old4.stat <- as.network.matrix(old4_adj)
new4.stat <- as.network.matrix(new4_adj)
print(new4.stat)
Network attributes:
vertices = 12
directed = TRUE
hyper = FALSE
loops = FALSE
multiple = FALSE
bipartite = FALSE
total edges= 34
missing edges= 0
non-missing edges= 34
Vertex attribute names:
vertex.names
No edge attributes
#Matrix to 2nd power for two step derived/reflected centrality measurement.
new4_adjacency2<-new4_adj %*% new4_adj
#Calculate the proportion of reflected centrality.
new_rc<-diag(as.matrix(new4_adjacency2))/rowSums(as.matrix(new4_adjacency2))
new_rc<-ifelse(is.nan(new_rc),0,new_rc)
head(new_rc)
Cedric Diggory Cho Chang Colin Creevey Dennis Creevey
0.1428571 0.0000000 0.1250000 0.1250000
Fred Weasley George Weasley
0.2000000 0.1904762
#Calculate the proportion of derived centrality.
new_dc<-1-diag(as.matrix(new4_adjacency2))/rowSums(as.matrix(new4_adjacency2))
new_dc<-ifelse(is.nan(new_dc),1,new_dc)
head(new_dc)
Cedric Diggory Cho Chang Colin Creevey Dennis Creevey
0.8571429 1.0000000 0.8750000 0.8750000
Fred Weasley George Weasley
0.8000000 0.8095238
Now we’ll add in our closesness, betweeness and brokerage scores:
#calculate closeness centrality: igraph
closeIG <- igraph::closeness(new4.ig)
head(igraph::closeness(new4.ig))
Cedric Diggory Cho Chang Colin Creevey Dennis Creevey
0.06666667 0.06250000 0.05000000 0.05000000
Fred Weasley George Weasley
0.08333333 0.07692308
[1] 0 0 0 0 0 0
closeIG_df <- data_frame(closeIG)
new4.nodes <- new4.nodes %>%
mutate(closeness = closeIG_df$closeIG)
#Betweeness Centrality
head(igraph::betweenness(new4.ig, directed=TRUE, weights=NA))
Cedric Diggory Cho Chang Colin Creevey Dennis Creevey
8 0 0 0
Fred Weasley George Weasley
9 0
betweenIG <- igraph::betweenness(new4.ig, directed=TRUE, weights=NA)
betweenIG_df <- data_frame(betweenIG)
new4.nodes <- new4.nodes %>%
mutate(between = betweenIG_df$betweenIG)
Adding constraint
constraintIG <- constraint(new4.ig)
constraintIG_df <- data_frame(constraintIG)
new4.nodes <- new4.nodes %>%
mutate(constraint = constraintIG_df$constraintIG)
Brokerage:
#return matrix of standardized brokerage scores
head(brokerage(new4.stat, cl = new4.nodes$name)$z.nli)
w_I w_O b_IO b_OI b_O t
Cedric Diggory NaN NaN NaN NaN -1.0136651 -1.0136651
Cho Chang NaN NaN NaN NaN -1.2430947 -1.2430947
Colin Creevey NaN NaN NaN NaN -1.2430947 -1.2430947
Dennis Creevey NaN NaN NaN NaN -1.2430947 -1.2430947
Fred Weasley NaN NaN NaN NaN -0.3253764 -0.3253764
George Weasley NaN NaN NaN NaN -1.2430947 -1.2430947
Quick Reminder of the types of brokerage:
Liaison: all three actors occupy different groups
Representative: one member of a subgroup has role (either assumed or assigned) of communicating/negotiating with outsiders
Gatekeeper: actor screens or gathers resources and distributes them to members of own group
Itinerant/Cosmopolitan: principals belong to same group while intermediary belongs to a different group
Coordinator: all actors belong to the same group
b_O: Liaison role; the broker mediates contact between two individuals from different groups, neither of which is the group to which he or she belongs. Two-path structure: A -> B -> C
In this evaluation, liaison and total brokerage are the same, so we’ll add it to the nodes df as liaison/total
brokerage <- brokerage(new4.stat, cl = new4.nodes$name)$z.nli
brokerage_df <- as.data.frame(brokerage)
new4.nodes <- new4.nodes %>%
mutate(liaison_total = brokerage_df$b_O)
[Note: I tried to add the Bonacich measures and they continue to fail.]
Now I’m going to try and run some network statistics based on our Week 9 tutorial code.
First, we’ll take a look at transitivity of our Harry Potter support network. We are using the reduced network that has all isolates removed.
Recall:
#new4.stat <- as.network.matrix(book4)
print(new4.stat)
Network attributes:
vertices = 12
directed = TRUE
hyper = FALSE
loops = FALSE
multiple = FALSE
bipartite = FALSE
total edges= 34
missing edges= 0
non-missing edges= 34
Vertex attribute names:
vertex.names
No edge attributes
sna::dyad.census(new4.stat)
Mut Asym Null
[1,] 12 10 44
Triad Census
sna::triad.census(new4.stat)
003 012 102 021D 021U 021C 111D 111U 030T 030C 201 120D 120U
[1,] 78 36 44 0 5 5 21 11 0 0 4 1 4
120C 210 300
[1,] 1 0 10
Global Transitivity
#get network transitivity: igraph
transitivity(new4.ig)
[1] 0.5106383
Graph Density
graph.density(new4.ig)
[1] 0.2575758
Now we’ll assess whether our HP Network is different from a random network.
First, condition on: size
[Do I need to use a matrix object? Or can I use a graph object? This is an SNA function.]
#compare network transitivity to null conditional on size
trans.cug<-cug.test(new4.stat, FUN=gtrans, mode="digraph", cmode="size")
trans.cug
Univariate Conditional Uniform Graph Test
Conditioning Method: size
Graph Type: digraph
Diagonal Used: FALSE
Replications: 1000
Observed Value: 0.6016949
Pr(X>=Obs): 0.016
Pr(X<=Obs): 0.984
Histogram with observed compared to replication statistics
#plot vs simulation results
plot(trans.cug)
From this test, conditioning on size, the probability that our HP Network transitivity is greater than random is p <= .015, which is stastitically significant (using p<=.05 as the measure for significance).
We can also test this:
#function to return t statistic
cug.t<-function(cug.object){
(cug.object$obs.stat-mean(cug.object$rep.stat))/sd(cug.object$rep.stat)
}
#t-stat between observed and simulated networks
cug.t(trans.cug)
[1] 2.100745
[This is where I had question on the original tutorial - In fact, if we wanted to know how many standard errors away the observed transitivity value is from what we would expect, on average, this can be calulated using the cug.test object as follows. So am I correct that this metric that’s returned is a standard error, not standard deviation? I still find this confusing!]
We can now run similar analyses using different metrics.
For this network, I’m also interested in conditioning on edges:
#compare network transitivity to null conditional on size
transE.cug<-cug.test(new4.stat, FUN=gtrans, mode="digraph", cmode="edges")
transE.cug
Univariate Conditional Uniform Graph Test
Conditioning Method: edges
Graph Type: digraph
Diagonal Used: FALSE
Replications: 1000
Observed Value: 0.6016949
Pr(X>=Obs): 0
Pr(X<=Obs): 1
plot(transE.cug)
I interpret this result to say that our HP network is significantly higher in transitivity (conditioning on edges) than random. This makes sense, particularly as this is a support network with isolates removed. Let’s run this on the full network and see what we get!
#compare network transitivity to null conditional on size
transEfull.cug<-cug.test(book4.stat, FUN=gtrans, mode="digraph", cmode="edges")
transEfull.cug
Univariate Conditional Uniform Graph Test
Conditioning Method: edges
Graph Type: digraph
Diagonal Used: FALSE
Replications: 1000
Observed Value: 0.6016949
Pr(X>=Obs): 0
Pr(X<=Obs): 1
plot(transEfull.cug)
Hm. So adding in our isolates doesn’t change anything about our transitivity?
Moving on!
Now I’m going to test on network degree centralization, again looking at our smaller network without isolates and our larger with the isolates.
without isolates
#compare network degree centralization to null conditional on size
c.degree.cugSmall <-cug.test(new4.stat,FUN=centralization, FUN.arg=list(FUN=degree, cmode="indegree"), mode="digraph", cmode="size")
#plot vs simulation results
plot(c.degree.cugSmall)
Again, a result that makes total sense, given the way this network was constructed.
with isolates
#compare network degree centralization to null conditional on size
c.degree.cugFull <-cug.test(book4.stat, FUN=centralization, FUN.arg=list(FUN=degree,
cmode="indegree"), mode="digraph", cmode="size")
c.degree.cugFull
Univariate Conditional Uniform Graph Test
Conditioning Method: size
Graph Type: digraph
Diagonal Used: FALSE
Replications: 1000
Observed Value: 0.1365583
Pr(X>=Obs): 0.635
Pr(X<=Obs): 0.37
#plot vs simulation results
plot(c.degree.cugFull)
Now this looks very different! Let’s run the t statistic.
#t-stat between observed and simulated networks
cug.t(c.degree.cugFull)
[1] -0.4361202
So with our isolates included, this network does not look different from a random network!
Now let’s look at betweenness:
No Isolates
#compare network betweenness centralization to null conditional on size
b.degree.cugSmall <-cug.test(new4.stat, FUN=centralization, FUN.arg=list(FUN=betweenness, cmode="directed"), mode="digraph", cmode="size", reps=100)
b.degree.cugSmall
Univariate Conditional Uniform Graph Test
Conditioning Method: size
Graph Type: digraph
Diagonal Used: FALSE
Replications: 100
Observed Value: 0.3661157
Pr(X>=Obs): 0
Pr(X<=Obs): 1
#plot vs simulation results
plot(b.degree.cugSmall)
So, without our isolates, this is very different from a random network.
With Isolates
#compare network betweenness centralization to null conditional on size
b.degree.cugFll <-cug.test(book4.stat, FUN=centralization, FUN.arg=list(FUN=betweenness, cmode="directed"), mode="digraph", cmode="size", reps=100)
b.degree.cugFll
Univariate Conditional Uniform Graph Test
Conditioning Method: size
Graph Type: digraph
Diagonal Used: FALSE
Replications: 100
Observed Value: 0.0107107
Pr(X>=Obs): 0
Pr(X<=Obs): 1
#plot vs simulation results
plot(b.degree.cugFll)
This is very interesting to me! That even with our isolates, we still have a higher measure of betweenness for this network than random. I wonder if this is because Harry’s betweenness score is very high? And that drives up the whole network?
Now we’ll test if this network looks like a preferential attachment network comparing to our network with isolates removed.
library(intergraph)
#create empty dataframe for simulations
trials<-data.frame(id=1:100, gdens=NA, gtrans=NA, cent.deg=NA, cent.bet=NA)
#simulate PA networks and add stats to trials dataframe: size
for ( i in 1:100 ){
pa.ig<- igraph::sample_pa(n = network.size(new4.stat), directed=TRUE)
pa.stat<-intergraph::asNetwork(pa.ig)
trials$gdens<-gden(pa.stat)
trials$gtrans[i] <- gtrans(pa.stat)
trials$cent.deg[i] <- centralization(pa.stat, FUN=degree, cmode="indegree")
trials$cent.bet[i] <-centralization(pa.stat, FUN=betweenness)
}
summary(trials)
id gdens gtrans cent.deg
Min. : 1.00 Min. :0.08333 Min. :0 Min. :0.2066
1st Qu.: 25.75 1st Qu.:0.08333 1st Qu.:0 1st Qu.:0.3058
Median : 50.50 Median :0.08333 Median :0 Median :0.4050
Mean : 50.50 Mean :0.08333 Mean :0 Mean :0.4615
3rd Qu.: 75.25 3rd Qu.:0.08333 3rd Qu.:0 3rd Qu.:0.6033
Max. :100.00 Max. :0.08333 Max. :0 Max. :0.9008
cent.bet
Min. :0.006612
1st Qu.:0.023554
Median :0.036364
Mean :0.042810
3rd Qu.:0.058884
Max. :0.129752
sim.t<-function(g, trials){
temp<-data.frame(density=c(gden(g),mean(trials$gdens),sd(trials$gdens)),
transitivity=c(gtrans(g),mean(trials$gtrans),sd(trials$gtrans)),
indegCent=c(centralization(g, FUN=degree, cmode="indegree"),mean(trials$cent.deg), sd(trials$cent.deg)),
betwCent=c(centralization(g, FUN=betweenness), mean(trials$cent.bet), sd(trials$cent.bet)))
rownames(temp)<-c("Observed","Simulated", "SD")
temp<-data.frame(t(temp))
temp$tvalue<-(temp$Observed-temp$Simulated)/temp$SD
temp
}
plot.sim.t<-function(g,trials){
temp<-data.frame(net.stat=c("gtrans","cent.deg","cent.bet"), x=c(gtrans(g),centralization(g, FUN=degree, cmode="indegree"), centralization(g, FUN=betweenness)))
trials%>%
select(gtrans:cent.bet)%>%
gather(key="net.stat",value = "estimate")%>%
ggplot(aes(estimate)) +
geom_histogram() +
facet_wrap(net.stat ~ ., scales="free", ncol=3) +
geom_vline(data=temp, aes(xintercept=x),
linetype="dashed", size=1, colour="red")
}
#check for differences from null
sim.t(g=new4.stat,trials)
Observed Simulated SD tvalue
density 0.2575758 0.08333333 0.00000000 Inf
transitivity 0.6016949 0.00000000 0.00000000 Inf
indegCent 0.6115702 0.46148760 0.15963585 0.9401563
betwCent 0.3661157 0.04280992 0.02605203 12.4100023
plot.sim.t(new4.stat, trials)
So, this is interesting, which is that both betweenness and transitivity are very different from random, but degree centrality is not. So, even though this network looks random based on number of connections, it’s actually not, because of the nature of those connections? At least in comparison to a preferential attachment network.