Co-Author Network Analysis

Exploring the Co-Author Network

Lissie Bates-Haus, Ph.D. https://github.com/lbateshaus (U Mass Amherst DACSS MS Student)https://www.umass.edu/sbs/data-analytics-and-computational-social-science-program/ms
2022-05-14

Import co-author adjacency matrix:

setwd("~/DACSS/697E Network Analysis/Final Project")
library(readr)

authors_df <- read_csv("coauthorMatrix.csv", show_col_types = FALSE)
coauthorMatrix <- as.matrix(read_csv("coauthorMatrix.csv"), show_col_types = FALSE)

#This is the csv of my citations from EndNote
ethics_authors <- read_csv("ethics_authors.csv", show_col_types = FALSE)

#Removing the first column of names to create a square matrix
authors_adj <- coauthorMatrix[ , -1]
authors.df <- as.data.frame(authors_adj, show_col_types = FALSE)

dim(coauthorMatrix)
[1] 231 232
dim(authors_adj)
[1] 231 231

I’m going to create a vector of names as a node attributes:

library(tidyverse)
authorsNodes <- authors_df %>% select(1)

Create igprah object from matrix (this is a weighted, undirected, upper matrix):

library(igraph)
library(statnet)

authors.ig <- graph_from_adjacency_matrix(authors_adj, mode = "upper", weighted = TRUE)
authors.stat <- as.network.matrix(authors_adj)

Add measures to the nodes dataframe:

authorsNodes <- authorsNodes %>%

mutate(degreeTotal = igraph::degree(authors.ig, mode="total", loops=FALSE))

#note: undirected network, so just using total

newEigen <- centr_eigen(authors.ig, directed = F)

newEigen$vector
  [1] 0.083877582 0.070732200 0.083353694 0.076526021 0.083388285
  [6] 0.091519049 0.070732200 0.076526021 0.092603713 0.076149138
 [11] 0.070732200 0.000000000 0.079973532 0.075737122 0.000000000
 [16] 0.092603713 0.083353694 0.083353694 0.076526021 0.084196359
 [21] 0.098354074 0.093075125 0.076526021 0.076526021 0.004978261
 [26] 0.076526021 0.076936163 0.076526021 0.083353694 0.071087856
 [31] 0.000000000 0.077527172 0.071441750 0.083775990 0.082387033
 [36] 0.076149138 0.070732200 0.076526021 0.076526021 0.091519049
 [41] 0.076526021 0.086028365 0.078497403 0.083353694 0.076526021
 [46] 0.005896307 0.076969761 0.083353694 0.083353694 0.076526021
 [51] 0.103740115 0.103740115 0.076149138 0.076969761 0.070732200
 [56] 0.091519049 0.079973532 0.010772081 0.070732200 0.076936163
 [61] 0.097698888 0.096090350 0.076526021 0.076526021 0.082804033
 [66] 0.083353694 0.084633552 0.004978261 0.076526021 0.092598336
 [71] 0.070732200 0.076526021 0.076526021 0.076526021 0.133240774
 [76] 0.076526021 0.005956664 0.004978261 0.004978261 0.083353694
 [81] 0.070732200 0.096013504 0.083877582 0.102564453 0.077920018
 [86] 0.076089254 0.083353694 0.076526021 0.070732200 0.082387033
 [91] 0.004978261 0.098354074 0.083353694 0.083353694 0.076526021
 [96] 0.081943269 0.103740115 0.076526021 0.083353694 0.000000000
[101] 0.133240774 0.076526021 0.076526021 0.076526021 0.010772081
[106] 0.082037619 0.005956664 0.005028200 0.076526021 0.081943269
[111] 0.004978261 0.093075125 0.083458529 0.093075125 0.089749435
[116] 0.096090350 0.083353694 0.076526021 0.076526021 0.086028365
[121] 0.098139955 0.070732200 0.083353694 0.083914886 0.083353694
[126] 0.004978261 0.091519049 0.083388285 0.000000000 0.083973313
[131] 0.091519049 0.010772081 0.092598336 0.005910195 0.070732200
[136] 0.076911112 0.089814690 0.076526021 0.084305039 0.083458529
[141] 0.098139955 0.083353694 0.070732200 0.083584694 0.018123643
[146] 0.018160946 0.076526021 0.011182224 0.032600134 0.005355300
[151] 0.010772081 0.011215821 0.083353694 0.010772081 0.010772081
[156] 0.081943269 0.010772081 0.076526021 0.070732200 0.004978261
[161] 0.076526021 0.081943269 0.004978261 0.010772081 0.017599754
[166] 0.082804033 0.010772081 0.070732200 0.011773232 0.010772081
[171] 0.076526021 0.136281076 0.011805474 0.092598336 0.004978261
[176] 0.081943269 0.091519049 0.004978261 0.076526021 0.017599754
[181] 0.004978261 0.012743463 0.017599754 0.082324256 0.026849774
[186] 0.010772081 0.025631682 0.004978261 0.000000000 0.010772081
[191] 0.070732200 0.004978261 0.017599754 0.091519049 0.076936163
[196] 0.103615935 0.010031493 0.010772081 0.011182224 0.070732200
[201] 0.017704590 0.010772081 0.076555176 0.030336411 0.004978261
[206] 0.011655797 0.005933542 0.000000000 0.017599754 0.011215821
[211] 0.005794131 0.005413144 0.000000000 0.010801237 0.017599754
[216] 0.091519049 0.025765109 0.026844397 0.004978261 0.010772081
[221] 0.000000000 0.011182224 0.005484153 0.010772081 0.000000000
[226] 0.007218673 0.025765109 0.017634346 0.005956664 0.010772081
[231] 1.000000000
newEigen$centralization
[1] 0.947438

Now to run some CUG tests on this network! [mode = graph because this is undirected]

#compare network transitivity to null conditional on size
trans.cug<-cug.test(authors.stat, FUN=gtrans, mode="graph", cmode="size")
trans.cug

Univariate Conditional Uniform Graph Test

Conditioning Method: size 
Graph Type: graph 
Diagonal Used: FALSE 
Replications: 1000 

Observed Value: 0.07993769 
Pr(X>=Obs): 1 
Pr(X<=Obs): 0 
#plot vs simulation results
plot(trans.cug)

So, this network has much lower transitivity than a random network, when conditioned on size. This makes sense, given the isolation of the different components.

#function to return t statistic
cug.t<-function(cug.object){
  (cug.object$obs.stat-mean(cug.object$rep.stat))/sd(cug.object$rep.stat)
}

#t-stat between observed and simulated networks
cug.t(trans.cug)
[1] -138.0246

Does this result change if we condition on edges?

#compare network transitivity to null conditional on size
transE.cug<-cug.test(authors.stat, FUN=gtrans, mode="graph", cmode="edges")
transE.cug

Univariate Conditional Uniform Graph Test

Conditioning Method: edges 
Graph Type: graph 
Diagonal Used: FALSE 
Replications: 1000 

Observed Value: 0.07993769 
Pr(X>=Obs): 0 
Pr(X<=Obs): 1 
plot(transE.cug)

#t-stat between observed and simulated networks
cug.t(transE.cug)
[1] 12.37051

This is really interesting to me, that based on edges, transitivity is much higher than would be expected in a random network. I think we already know this is not a random network, but it’s super interesting that there is such a difference when conditioning on size versus edges.

Now, given that we already know our centralization scores are low, I wouldn’t expect this to be different from random, but let’s see!

#compare network degree centralization to null conditional on size
c.degree.cug <-cug.test(authors.stat,FUN=centralization,  FUN.arg=list(FUN=degree, cmode="degree"), mode="graph", cmode="size") 
c.degree.cug

Univariate Conditional Uniform Graph Test

Conditioning Method: size 
Graph Type: graph 
Diagonal Used: FALSE 
Replications: 1000 

Observed Value: 0.9880197 
Pr(X>=Obs): 0 
Pr(X<=Obs): 1 
#plot vs simulation results
plot(c.degree.cug)
#t-stat between observed and simulated networks
cug.t(c.degree.cug)
[1] 71.84315

Okay this was… not at all what I was expecting. Does this change if we condition on edges? Is this related to the fact that overall, all nodes are connected (because we dropped all our isolates)?

#compare network degree centralization to null conditional on edges
c.degreeE.cug <-cug.test(authors.stat,FUN=centralization,  FUN.arg=list(FUN=degree, cmode="degree"), mode="graph", cmode="edges") 
c.degree.cug

Univariate Conditional Uniform Graph Test

Conditioning Method: size 
Graph Type: graph 
Diagonal Used: FALSE 
Replications: 1000 

Observed Value: 0.9880197 
Pr(X>=Obs): 0 
Pr(X<=Obs): 1 
#plot vs simulation results
plot(c.degreeE.cug)
#t-stat between observed and simulated networks
cug.t(c.degreeE.cug)
[1] 188.5672

Same result for edges.

Now we’re going to take a look to see if this network looks like a preferential attachment network.

library(intergraph)

library(intergraph)

#create empty dataframe for simulations
trials<-data.frame(id=1:100, gdens=NA, gtrans=NA, cent.deg=NA, cent.bet=NA)

#simulate PA networks and add stats to trials dataframe: size
for ( i in 1:100 ){ 
  pa.ig<- igraph::sample_pa(n = network.size(authors.stat), directed=TRUE)
  pa.stat<-intergraph::asNetwork(pa.ig)
  trials$gdens<-gden(pa.stat)
  trials$gtrans[i] <- gtrans(pa.stat)
  trials$cent.deg[i] <- centralization(pa.stat, FUN=degree, cmode="indegree")
  trials$cent.bet[i] <-centralization(pa.stat, FUN=betweenness)
}

summary(trials)
       id             gdens              gtrans     cent.deg      
 Min.   :  1.00   Min.   :0.004329   Min.   :0   Min.   :0.05242  
 1st Qu.: 25.75   1st Qu.:0.004329   1st Qu.:0   1st Qu.:0.10482  
 Median : 50.50   Median :0.004329   Median :0   Median :0.12229  
 Mean   : 50.50   Mean   :0.004329   Mean   :0   Mean   :0.12962  
 3rd Qu.: 75.25   3rd Qu.:0.004329   3rd Qu.:0   3rd Qu.:0.14958  
 Max.   :100.00   Max.   :0.004329   Max.   :0   Max.   :0.25329  
    cent.bet        
 Min.   :0.0006394  
 1st Qu.:0.0013637  
 Median :0.0020271  
 Mean   :0.0024104  
 3rd Qu.:0.0031647  
 Max.   :0.0096876  
sim.t<-function(g, trials){
  temp<-data.frame(density=c(gden(g),mean(trials$gdens),sd(trials$gdens)),
             transitivity=c(gtrans(g),mean(trials$gtrans),sd(trials$gtrans)),
             indegCent=c(centralization(g, FUN=degree, cmode="indegree"),mean(trials$cent.deg), sd(trials$cent.deg)),
             betwCent=c(centralization(g, FUN=betweenness), mean(trials$cent.bet), sd(trials$cent.bet)))
  rownames(temp)<-c("Observed","Simulated", "SD")
  temp<-data.frame(t(temp))
  temp$tvalue<-(temp$Observed-temp$Simulated)/temp$SD
  temp
}
plot.sim.t<-function(g,trials){
  temp<-data.frame(net.stat=c("gtrans","cent.deg","cent.bet"), x=c(gtrans(g),centralization(g, FUN=degree, cmode="indegree"), centralization(g, FUN=betweenness)))
  trials%>%
    select(gtrans:cent.bet)%>%
    gather(key="net.stat",value = "estimate")%>%
    ggplot(aes(estimate)) +
    geom_histogram() +
    facet_wrap(net.stat ~ ., scales="free", ncol=3) +
    geom_vline(data=temp, aes(xintercept=x),
               linetype="dashed", size=1, colour="red")
}
#check for differences from null

sim.t(g=authors.stat,trials)
               Observed   Simulated         SD    tvalue
density      0.02053454 0.004329004 0.00000000       Inf
transitivity 0.07993769 0.000000000 0.00000000       Inf
indegCent    0.98372401 0.129623440 0.04148628  20.58754
betwCent     0.61878906 0.002410407 0.00141299 436.22301
plot.sim.t(authors.stat, trials)

SO, overall we can conclude that this network does not look like a preferential attachment network!