Exploring the Co-Author Network
Import co-author adjacency matrix:
setwd("~/DACSS/697E Network Analysis/Final Project")
library(readr)
authors_df <- read_csv("coauthorMatrix.csv", show_col_types = FALSE)
coauthorMatrix <- as.matrix(read_csv("coauthorMatrix.csv"), show_col_types = FALSE)
#This is the csv of my citations from EndNote
ethics_authors <- read_csv("ethics_authors.csv", show_col_types = FALSE)
#Removing the first column of names to create a square matrix
authors_adj <- coauthorMatrix[ , -1]
authors.df <- as.data.frame(authors_adj, show_col_types = FALSE)
dim(coauthorMatrix)
[1] 231 232
dim(authors_adj)
[1] 231 231
I’m going to create a vector of names as a node attributes:
Create igprah object from matrix (this is a weighted, undirected, upper matrix):
library(igraph)
library(statnet)
authors.ig <- graph_from_adjacency_matrix(authors_adj, mode = "upper", weighted = TRUE)
authors.stat <- as.network.matrix(authors_adj)
Add measures to the nodes dataframe:
authorsNodes <- authorsNodes %>%
mutate(degreeTotal = igraph::degree(authors.ig, mode="total", loops=FALSE))
#note: undirected network, so just using total
newEigen <- centr_eigen(authors.ig, directed = F)
newEigen$vector
[1] 0.083877582 0.070732200 0.083353694 0.076526021 0.083388285
[6] 0.091519049 0.070732200 0.076526021 0.092603713 0.076149138
[11] 0.070732200 0.000000000 0.079973532 0.075737122 0.000000000
[16] 0.092603713 0.083353694 0.083353694 0.076526021 0.084196359
[21] 0.098354074 0.093075125 0.076526021 0.076526021 0.004978261
[26] 0.076526021 0.076936163 0.076526021 0.083353694 0.071087856
[31] 0.000000000 0.077527172 0.071441750 0.083775990 0.082387033
[36] 0.076149138 0.070732200 0.076526021 0.076526021 0.091519049
[41] 0.076526021 0.086028365 0.078497403 0.083353694 0.076526021
[46] 0.005896307 0.076969761 0.083353694 0.083353694 0.076526021
[51] 0.103740115 0.103740115 0.076149138 0.076969761 0.070732200
[56] 0.091519049 0.079973532 0.010772081 0.070732200 0.076936163
[61] 0.097698888 0.096090350 0.076526021 0.076526021 0.082804033
[66] 0.083353694 0.084633552 0.004978261 0.076526021 0.092598336
[71] 0.070732200 0.076526021 0.076526021 0.076526021 0.133240774
[76] 0.076526021 0.005956664 0.004978261 0.004978261 0.083353694
[81] 0.070732200 0.096013504 0.083877582 0.102564453 0.077920018
[86] 0.076089254 0.083353694 0.076526021 0.070732200 0.082387033
[91] 0.004978261 0.098354074 0.083353694 0.083353694 0.076526021
[96] 0.081943269 0.103740115 0.076526021 0.083353694 0.000000000
[101] 0.133240774 0.076526021 0.076526021 0.076526021 0.010772081
[106] 0.082037619 0.005956664 0.005028200 0.076526021 0.081943269
[111] 0.004978261 0.093075125 0.083458529 0.093075125 0.089749435
[116] 0.096090350 0.083353694 0.076526021 0.076526021 0.086028365
[121] 0.098139955 0.070732200 0.083353694 0.083914886 0.083353694
[126] 0.004978261 0.091519049 0.083388285 0.000000000 0.083973313
[131] 0.091519049 0.010772081 0.092598336 0.005910195 0.070732200
[136] 0.076911112 0.089814690 0.076526021 0.084305039 0.083458529
[141] 0.098139955 0.083353694 0.070732200 0.083584694 0.018123643
[146] 0.018160946 0.076526021 0.011182224 0.032600134 0.005355300
[151] 0.010772081 0.011215821 0.083353694 0.010772081 0.010772081
[156] 0.081943269 0.010772081 0.076526021 0.070732200 0.004978261
[161] 0.076526021 0.081943269 0.004978261 0.010772081 0.017599754
[166] 0.082804033 0.010772081 0.070732200 0.011773232 0.010772081
[171] 0.076526021 0.136281076 0.011805474 0.092598336 0.004978261
[176] 0.081943269 0.091519049 0.004978261 0.076526021 0.017599754
[181] 0.004978261 0.012743463 0.017599754 0.082324256 0.026849774
[186] 0.010772081 0.025631682 0.004978261 0.000000000 0.010772081
[191] 0.070732200 0.004978261 0.017599754 0.091519049 0.076936163
[196] 0.103615935 0.010031493 0.010772081 0.011182224 0.070732200
[201] 0.017704590 0.010772081 0.076555176 0.030336411 0.004978261
[206] 0.011655797 0.005933542 0.000000000 0.017599754 0.011215821
[211] 0.005794131 0.005413144 0.000000000 0.010801237 0.017599754
[216] 0.091519049 0.025765109 0.026844397 0.004978261 0.010772081
[221] 0.000000000 0.011182224 0.005484153 0.010772081 0.000000000
[226] 0.007218673 0.025765109 0.017634346 0.005956664 0.010772081
[231] 1.000000000
newEigen$centralization
[1] 0.947438
Now to run some CUG tests on this network! [mode = graph because this is undirected]
#compare network transitivity to null conditional on size
trans.cug<-cug.test(authors.stat, FUN=gtrans, mode="graph", cmode="size")
trans.cug
Univariate Conditional Uniform Graph Test
Conditioning Method: size
Graph Type: graph
Diagonal Used: FALSE
Replications: 1000
Observed Value: 0.07993769
Pr(X>=Obs): 1
Pr(X<=Obs): 0
#plot vs simulation results
plot(trans.cug)
So, this network has much lower transitivity than a random network, when conditioned on size. This makes sense, given the isolation of the different components.
#function to return t statistic
cug.t<-function(cug.object){
(cug.object$obs.stat-mean(cug.object$rep.stat))/sd(cug.object$rep.stat)
}
#t-stat between observed and simulated networks
cug.t(trans.cug)
[1] -138.0246
Does this result change if we condition on edges?
#compare network transitivity to null conditional on size
transE.cug<-cug.test(authors.stat, FUN=gtrans, mode="graph", cmode="edges")
transE.cug
Univariate Conditional Uniform Graph Test
Conditioning Method: edges
Graph Type: graph
Diagonal Used: FALSE
Replications: 1000
Observed Value: 0.07993769
Pr(X>=Obs): 0
Pr(X<=Obs): 1
plot(transE.cug)
#t-stat between observed and simulated networks
cug.t(transE.cug)
[1] 12.37051
This is really interesting to me, that based on edges, transitivity is much higher than would be expected in a random network. I think we already know this is not a random network, but it’s super interesting that there is such a difference when conditioning on size versus edges.
Now, given that we already know our centralization scores are low, I wouldn’t expect this to be different from random, but let’s see!
#compare network degree centralization to null conditional on size
c.degree.cug <-cug.test(authors.stat,FUN=centralization, FUN.arg=list(FUN=degree, cmode="degree"), mode="graph", cmode="size")
c.degree.cug
Univariate Conditional Uniform Graph Test
Conditioning Method: size
Graph Type: graph
Diagonal Used: FALSE
Replications: 1000
Observed Value: 0.9880197
Pr(X>=Obs): 0
Pr(X<=Obs): 1
#plot vs simulation results
plot(c.degree.cug)
#t-stat between observed and simulated networks
cug.t(c.degree.cug)
[1] 71.84315
Okay this was… not at all what I was expecting. Does this change if we condition on edges? Is this related to the fact that overall, all nodes are connected (because we dropped all our isolates)?
#compare network degree centralization to null conditional on edges
c.degreeE.cug <-cug.test(authors.stat,FUN=centralization, FUN.arg=list(FUN=degree, cmode="degree"), mode="graph", cmode="edges")
c.degree.cug
Univariate Conditional Uniform Graph Test
Conditioning Method: size
Graph Type: graph
Diagonal Used: FALSE
Replications: 1000
Observed Value: 0.9880197
Pr(X>=Obs): 0
Pr(X<=Obs): 1
#plot vs simulation results
plot(c.degreeE.cug)
#t-stat between observed and simulated networks
cug.t(c.degreeE.cug)
[1] 188.5672
Same result for edges.
Now we’re going to take a look to see if this network looks like a preferential attachment network.
library(intergraph)
library(intergraph)
#create empty dataframe for simulations
trials<-data.frame(id=1:100, gdens=NA, gtrans=NA, cent.deg=NA, cent.bet=NA)
#simulate PA networks and add stats to trials dataframe: size
for ( i in 1:100 ){
pa.ig<- igraph::sample_pa(n = network.size(authors.stat), directed=TRUE)
pa.stat<-intergraph::asNetwork(pa.ig)
trials$gdens<-gden(pa.stat)
trials$gtrans[i] <- gtrans(pa.stat)
trials$cent.deg[i] <- centralization(pa.stat, FUN=degree, cmode="indegree")
trials$cent.bet[i] <-centralization(pa.stat, FUN=betweenness)
}
summary(trials)
id gdens gtrans cent.deg
Min. : 1.00 Min. :0.004329 Min. :0 Min. :0.05242
1st Qu.: 25.75 1st Qu.:0.004329 1st Qu.:0 1st Qu.:0.10482
Median : 50.50 Median :0.004329 Median :0 Median :0.12229
Mean : 50.50 Mean :0.004329 Mean :0 Mean :0.12962
3rd Qu.: 75.25 3rd Qu.:0.004329 3rd Qu.:0 3rd Qu.:0.14958
Max. :100.00 Max. :0.004329 Max. :0 Max. :0.25329
cent.bet
Min. :0.0006394
1st Qu.:0.0013637
Median :0.0020271
Mean :0.0024104
3rd Qu.:0.0031647
Max. :0.0096876
sim.t<-function(g, trials){
temp<-data.frame(density=c(gden(g),mean(trials$gdens),sd(trials$gdens)),
transitivity=c(gtrans(g),mean(trials$gtrans),sd(trials$gtrans)),
indegCent=c(centralization(g, FUN=degree, cmode="indegree"),mean(trials$cent.deg), sd(trials$cent.deg)),
betwCent=c(centralization(g, FUN=betweenness), mean(trials$cent.bet), sd(trials$cent.bet)))
rownames(temp)<-c("Observed","Simulated", "SD")
temp<-data.frame(t(temp))
temp$tvalue<-(temp$Observed-temp$Simulated)/temp$SD
temp
}
plot.sim.t<-function(g,trials){
temp<-data.frame(net.stat=c("gtrans","cent.deg","cent.bet"), x=c(gtrans(g),centralization(g, FUN=degree, cmode="indegree"), centralization(g, FUN=betweenness)))
trials%>%
select(gtrans:cent.bet)%>%
gather(key="net.stat",value = "estimate")%>%
ggplot(aes(estimate)) +
geom_histogram() +
facet_wrap(net.stat ~ ., scales="free", ncol=3) +
geom_vline(data=temp, aes(xintercept=x),
linetype="dashed", size=1, colour="red")
}
#check for differences from null
sim.t(g=authors.stat,trials)
Observed Simulated SD tvalue
density 0.02053454 0.004329004 0.00000000 Inf
transitivity 0.07993769 0.000000000 0.00000000 Inf
indegCent 0.98372401 0.129623440 0.04148628 20.58754
betwCent 0.61878906 0.002410407 0.00141299 436.22301
plot.sim.t(authors.stat, trials)
SO, overall we can conclude that this network does not look like a preferential attachment network!