Question: Are properties of the commmunity structure of a literature predictive of measures of bias in that literature?
NOTE: Results are very different with bibliometrix package version 1.6 (this is 1.5). Why is this??
Parameters:
MIN_NUM_PAPERS <- 4 # per MA
# Analysis-network pairs of interest
ANALYSES <- c("co-citation", "collaboration","coupling", "co-occurrences")
NETWORKS <- c("references", "authors", "authors", "keywords")
Get dois for each paper in metalab dataset (obtained from: https://apps.crossref.org/SimpleTextQuery/)
# copy-pasting 50 at a time into search engine
dois <- read.csv("dois.csv") %>%
select(study_ID, doi) %>%
filter(doi != "" & !is.na(doi)) %>%
mutate(doi = ifelse(grepl(".org/", doi),
unlist(lapply(str_split(doi, ".org/"),
function(x) {x[2]})), as.character(doi)))
Scrape data from web of science at paper level using bibliometrix package
## Articles extracted 100
## Articles extracted 163
Number of papers with dois in web of science in each MA:
source("../../dashboard/global.R", chdir = TRUE) # all_data source
paper.data = all_data %>%
inner_join(dois, by="study_ID") %>%
mutate(doi = tolower(doi)) %>%
group_by(doi) %>%
select(short_name) %>%
slice(1)
# get web of science data for domain
ns = paper.data %>%
inner_join(wos, by = c("doi" = "DI")) %>%
group_by(short_name) %>%
summarize(n.papers = n())
kable(ns)
short_name | n.papers |
---|---|
catBias | 7 |
gaze_following | 12 |
idspref | 13 |
inphondb-native | 23 |
inphondb-nonnative | 4 |
inworddb | 49 |
labadv | 13 |
mutex | 16 |
phonotactics | 2 |
pointing_concurrent | 4 |
pointing_longitudinal | 1 |
sounds | 6 |
symbolism | 5 |
word_recognition | 6 |
paper.data = paper.data %>%
left_join(ns) %>%
filter(n.papers > MIN_NUM_PAPERS)
n.mas = length(unique(paper.data$short_name))
MAs with 4 or fewer papers are excluded. This leaves us with 10 MAs. Note that we’re losing papers here in two ways - those that don’t have dois (e.g. conference papers, n = approx. 40) and those with dois that are not in web of science (n = approx. 50).
getGraphData <- function (my.paper.data,
this.analysis,
this.network,
remove.multiple = TRUE,
noloops = FALSE) {
# FOR DEBUGGING
# my.paper.data = filter(paper.data, short_name == unique(paper.data$short_name)[2])
# this.analysis = "co-occurrences"
# this.network = "keywords"
# get web of science data for domain
this.wos = left_join(my.paper.data, wos, by = c("doi" = "DI")) %>%
as.data.frame() # necessary for biblioNetwork
sep = ifelse(this.analysis == "co-citation", ". ", "; ")
# make the network
this.net <- biblioNetwork(this.wos,
analysis = this.analysis,
network = this.network,
sep = sep)
if (length(this.net > 0)) {
# munge the network
graph <- graph.adjacency(this.net, mode = "undirected")
graph <- igraph::simplify(graph, remove.multiple = remove.multiple,
remove.loops = noloops)
# make into df for plotting
gn = asNetwork(graph)
}
cc <- try(ggnetwork(gn), silent = TRUE) # gets rid of too-small networks
if(!is(cc,"try-error")) {
return(ggnetwork(gn))
}
}
getModularity <- function (my.paper.data,
this.analysis,
this.network,
remove.multiple = FALSE,
noloops = FALSE) {
# get web of science data for domain
this.wos = left_join(my.paper.data, wos, by = c("doi" = "DI")) %>%
as.data.frame() # necessary for biblioNetwork
sep = ifelse(this.analysis == "co-citation", ". ", "; ")
# make the network
this.net <- biblioNetwork(this.wos,
analysis = this.analysis,
network = this.network,
sep = sep)
if (length(this.net > 0)) {
# munge the network
graph <- graph.adjacency(this.net, mode = "undirected")
graph <- igraph::simplify(graph, remove.multiple = remove.multiple,
remove.loops = noloops)
clustering = cluster_leading_eigen(graph, options = list(maxiter = 1000000))
closeness = mean(estimate_closeness(graph, cutoff = 100))
betweeness = mean(estimate_betweenness(graph, cutoff = 100))
degree = mean(igraph::degree(graph))
data.frame(analysis = this.analysis,
network = this.network,
Q = round(clustering$modularity,4),
n.groups = round(length(clustering),4),
closeness = round(closeness, 4),
betweeness = round(betweeness, 4),
degree = mean(degree))
}
}
# get network measures for all networks
analyses_list <- rep(ANALYSES, each = n.mas)
networks_list <- rep(NETWORKS, each = n.mas)
dfs <- paper.data %>%
mutate_each(funs(as.factor)) %>%
split(.$short_name) %>%
rep(length(ANALYSES))
args = list(dfs, analyses_list, networks_list)
modularity.data = args %>%
pmap(getModularity) %>%
bind_rows(.id = "short_name")
modularity.data.long = modularity.data %>%
mutate(closeness = log(closeness),
betweeness = log(betweeness),
degree = log(degree)) %>%
gather("network.measure", "network.value", 4:8) %>%
unite(network, analysis, network, sep = ".")
ggplot(modularity.data.long, aes(x = short_name,
y = network.value)) +
geom_bar(stat = "identity", aes(fill = short_name)) +
facet_wrap(network.measure ~ network,
ncol = 4, scales = "free") +
ggtitle("Network summary statistics") +
theme_bw() +
theme(axis.text.x = element_blank())
mean.ns = all_data %>%
rowwise() %>%
mutate(n_total = sum(c(n_1, n_2), na.rm = TRUE)) %>%
group_by(short_name) %>%
summarize(mean_n = mean(n_total))
ma.es = read.csv("ES_data_for_networks.csv") %>%
select(short_name, overall.d, overall.d.age,
fsn_string, egg.random.z) %>%
mutate(overall.d.age = abs(overall.d.age)) %>%
rename(fail_safe_n = fsn_string) %>%
left_join(mean.ns) %>%
gather("bias.measure", "bias.value", 2:6)
ma.es %>%
filter(short_name %in% modularity.data.long$short_name) %>%
ggplot(aes(x = short_name, y = bias.value)) +
geom_bar(stat = "identity", aes(fill = short_name)) +
facet_wrap(~bias.measure, scales = "free") +
ggtitle("Bias summary statistics") +
theme_bw() +
theme(axis.text.x = element_blank())
Get model fits. We’re predicting the bias value (e.g. fail-safe-n) with the network value (e.g. modularity), controlling for the number of papers in the MA. We’re fitting this model for each bias-network pair, for each network type (e.g. co-citation references).
net.es = left_join(modularity.data.long, ma.es) %>%
dplyr::as_data_frame() %>%
left_join(ns)
corr.sigs = net.es %>%
filter(is.finite(network.value) & is.finite(bias.value)) %>%
group_by(network, bias.measure, network.measure) %>%
#do(tidy(cor.test(.$bias.value, .$value, na.action = "na.pass"))) %>%
do(tidy(lm(bias.value ~ n.papers + network.value, data=.))) %>%
filter(term == "network.value") %>%
mutate(sig.col = ifelse(p.value < .05, "sig", "nsig")) %>%
select(estimate, sig.col) %>%
mutate(network.value = Inf, bias.value = Inf) # this is a hack
m = net.es %>%
filter(network.measure == "Q" & bias.measure == "overall.d.age") %>%
filter(is.finite(network.value) & is.finite(bias.value)) %>%
filter(network == "co-citation.references") %>%
as.data.frame() %>%
lm(bias.value ~ network.value, data= .) %>%
tidy()
Define plotting functions
network_plot <- function(d, analysis, network){
networkname <- paste(analysis, network, sep = ".")
d %>%
split(.$short_name) %>%
map(getGraphData, analysis, network) %>%
bind_rows(.id = "short_name") %>%
ggplot(aes(x = x, y = y, xend = xend, yend = yend)) +
geom_edges(color = "grey50") +
facet_wrap(~short_name) +
geom_nodes(aes(color = short_name)) +
ggtitle(networkname) +
theme_blank() +
theme(legend.position = "none")
}
corr_plot <- function(d, corrs, analysis, network){
networkname <- paste(analysis, network, sep = ".")
ggplot(filter(d, network == networkname),
aes(x = network.value, y = bias.value)) +
geom_rect(data = filter(corrs, network == networkname),
aes(fill = sig.col),
xmin = -Inf, xmax = Inf,
ymin = -Inf, ymax = Inf, alpha = 0.2) +
geom_point(aes(size = n.papers, color = short_name)) +
geom_smooth(method= "lm", color = "black") +
ggtitle(networkname)+
facet_grid(bias.measure ~ network.measure, scales = "free") +
scale_fill_manual(values = c( "grey99", "red1")) +
theme_bw() +
theme(legend.position = "none")
}
Two articles are linked when both are cited in a third article.
p <- 1
network_plot(paper.data, ANALYSES[p], NETWORKS[p])
corr_plot(net.es, corr.sigs, ANALYSES[p], NETWORKS[p])
Size of the point corresponds to number of papers in MA. Red facets indicate models where the network measure is a significant predictor of the bias value (controling for number of papers).
A link is added between two keywords if they co-occur in the same paper.
p <- 4
network_plot(paper.data, ANALYSES[p], NETWORKS[p])
corr_plot(net.es, corr.sigs, ANALYSES[p], NETWORKS[p])
sessionInfo()
## R version 3.3.3 (2017-03-06)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Sierra 10.12.3
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] sna_2.4 network_1.13.0 statnet.common_3.3.0
## [4] feather_0.3.1 shinydashboard_0.6.0 shiny_1.0.3
## [7] broom_0.4.2 igraph_1.0.1 intergraph_2.0-2
## [10] ggnetwork_0.5.1 bibliometrix_1.5 stringr_1.2.0
## [13] dplyr_0.5.0 purrr_0.2.2.2 readr_1.1.1
## [16] tibble_1.3.3 ggplot2_2.2.1 tidyverse_1.1.1
## [19] langcog_0.1.9001 tidyr_0.6.3 knitr_1.16
##
## loaded via a namespace (and not attached):
## [1] ggrepel_0.6.5 Rcpp_0.12.10 lubridate_1.6.0
## [4] lattice_0.20-34 rscopus_0.4.6 assertthat_0.2.0
## [7] rprojroot_1.2 digest_0.6.12 psych_1.7.5
## [10] mime_0.5 R6_2.2.1 cellranger_1.1.0
## [13] plyr_1.8.4 backports_1.0.5 evaluate_0.10
## [16] highr_0.6 httr_1.2.1 rlang_0.1.1
## [19] lazyeval_0.2.0 readxl_1.0.0 Matrix_1.2-8
## [22] rmarkdown_1.5 labeling_0.3 foreign_0.8-67
## [25] munsell_0.4.3 httpuv_1.3.3 modelr_0.1.0
## [28] mnormt_1.5-5 htmltools_0.3.6 flashClust_1.01-2
## [31] codetools_0.2-15 MASS_7.3-45 leaps_3.0
## [34] SnowballC_0.5.1 grid_3.3.3 xtable_1.8-2
## [37] nlme_3.1-131 jsonlite_1.4 gtable_0.2.0
## [40] factoextra_1.0.4 DBI_0.6-1 magrittr_1.5
## [43] scales_0.4.1 stringi_1.1.5 reshape2_1.4.2
## [46] scatterplot3d_0.3-40 xml2_1.1.1 RColorBrewer_1.1-2
## [49] tools_3.3.3 forcats_0.2.0 hms_0.3
## [52] parallel_3.3.3 yaml_2.1.14 colorspace_1.3-2
## [55] cluster_2.0.5 rvest_0.3.2 FactoMineR_1.35
## [58] haven_1.0.0
CS <- conceptualStructure(wos,field="ID", k.max=10, stemming=FALSE, labelsize = 1.5, minDegree = 2)
# Paper level analyses
getModularity_paper <- function (my.paper.data,
this.analysis,
this.network,
remove.multiple = FALSE,
remove.isolates = FALSE,
noloops = FALSE) {
# get web of science data for domain
my.M = left_join(my.paper.data, M, by = c("doi" = "DI")) %>%
as.data.frame() # necessary for biblioNetwork
SEP = ifelse(this.analysis == "co-citation", ". ", "; ")
# make the network
this.net <- biblioNetwork(my.M,
analysis = this.analysis,
network = this.network,
sep = SEP)
if (length(this.net > 0)) {
# munge the network
graph <- graph.adjacency(this.net, mode = "undirected")
data.frame(analysis = this.analysis,
network = this.network,
closeness = estimate_closeness(graph, cutoff= 100),
betweeness = estimate_betweenness(graph, cutoff= 100),
degree = igraph::degree(graph))
}
}
dfs <- paper.data %>%
mutate_each(funs(as.factor)) %>%
split(.$short_name) %>%
rep(4)
analyses <- rep(c("co-citation", "coupling", "co-occurrences", "collaboration"), each = 14)
networks <- rep(c("references", "authors", "keywords", "authors"), each = 14)
args = list(dfs, analyses, networks)
modularity.data = args %>%
pmap(getModularity_paper) %>%
filter(analysis == "co-citation") %>%
bind_rows(.id = "short_name")
k = M %>%
left_join(paper.data, by=c("DI"= "doi")) %>%
split(.$short_name) %>%
map(biblioAnalysis) %>%
bind_rows()
bibliometrix::summary(k, pause = FALSE)
S = summary(object = m, k = 10, pause = FALSE)
TO DO:
- non-significant vs. significant
- get summary statistics using summary function
- get ES controling for age, and ES error, also domain bias, n
- correlate everything at domain and entire dataset
- look at individual papers