Question: Are properties of the commmunity structure of a literature predictive of measures of bias in that literature?

NOTE: Results are very different with bibliometrix package version 1.6 (this is 1.5). Why is this??



Parameters:

MIN_NUM_PAPERS <- 4 # per MA

# Analysis-network pairs of interest
ANALYSES <- c("co-citation", "collaboration","coupling", "co-occurrences")
NETWORKS <- c("references", "authors", "authors", "keywords")

Read and process data

(1) Web of Science data

Get dois for each paper in metalab dataset (obtained from: https://apps.crossref.org/SimpleTextQuery/)

# copy-pasting 50 at a time into search engine
dois <- read.csv("dois.csv") %>%
      select(study_ID, doi) %>%
      filter(doi != "" & !is.na(doi)) %>%
      mutate(doi = ifelse(grepl(".org/", doi),
                          unlist(lapply(str_split(doi, ".org/"), 
                                   function(x) {x[2]})), as.character(doi)))

Scrape data from web of science at paper level using bibliometrix package

## Articles extracted   100 
## Articles extracted   163

Number of papers with dois in web of science in each MA:

source("../../dashboard/global.R", chdir = TRUE) # all_data source

paper.data = all_data %>%
  inner_join(dois, by="study_ID") %>%
  mutate(doi = tolower(doi)) %>%
  group_by(doi) %>%
  select(short_name) %>%
  slice(1)

# get web of science data for domain
ns = paper.data %>%
  inner_join(wos, by = c("doi" = "DI")) %>%
  group_by(short_name) %>%
  summarize(n.papers = n()) 

kable(ns)
short_name n.papers
catBias 7
gaze_following 12
idspref 13
inphondb-native 23
inphondb-nonnative 4
inworddb 49
labadv 13
mutex 16
phonotactics 2
pointing_concurrent 4
pointing_longitudinal 1
sounds 6
symbolism 5
word_recognition 6
paper.data = paper.data %>%
  left_join(ns) %>%
  filter(n.papers > MIN_NUM_PAPERS) 

n.mas = length(unique(paper.data$short_name))

MAs with 4 or fewer papers are excluded. This leaves us with 10 MAs. Note that we’re losing papers here in two ways - those that don’t have dois (e.g. conference papers, n = approx. 40) and those with dois that are not in web of science (n = approx. 50).

(2) Network data

getGraphData <- function (my.paper.data, 
                          this.analysis,
                          this.network,
                          remove.multiple = TRUE,
                          noloops = FALSE) {
    
    # FOR DEBUGGING
    # my.paper.data = filter(paper.data, short_name == unique(paper.data$short_name)[2])
    # this.analysis = "co-occurrences"
    # this.network = "keywords"
  
    # get web of science data for domain
    this.wos = left_join(my.paper.data, wos, by = c("doi" = "DI")) %>%
             as.data.frame() # necessary for biblioNetwork
    
    sep = ifelse(this.analysis == "co-citation", ". ", "; ")
  
    # make the network
    this.net <- biblioNetwork(this.wos, 
                   analysis = this.analysis, 
                   network = this.network, 
                   sep = sep) 
    
    if (length(this.net > 0)) {
      # munge the network
      graph <- graph.adjacency(this.net, mode = "undirected")
      graph <- igraph::simplify(graph, remove.multiple = remove.multiple, 
                          remove.loops = noloops)
      
      # make into df for plotting
      gn = asNetwork(graph)
    }
    
    cc <- try(ggnetwork(gn), silent = TRUE)  # gets rid of too-small networks
    
    if(!is(cc,"try-error")) {
       return(ggnetwork(gn))
    } 
}

(3) Network measures

getModularity <- function (my.paper.data, 
                        this.analysis,
                        this.network,
                        remove.multiple = FALSE,
                        noloops = FALSE) {

    # get web of science data for domain
    this.wos = left_join(my.paper.data, wos, by = c("doi" = "DI")) %>%
             as.data.frame() # necessary for biblioNetwork
    
    sep = ifelse(this.analysis == "co-citation", ". ", "; ")
  
    # make the network
    this.net <- biblioNetwork(this.wos, 
                   analysis = this.analysis, 
                   network = this.network, 
                   sep = sep) 
    
    if (length(this.net > 0)) {
      # munge the network
      graph <- graph.adjacency(this.net, mode = "undirected")
      graph <- igraph::simplify(graph, remove.multiple = remove.multiple, 
                                remove.loops = noloops)
      
      clustering = cluster_leading_eigen(graph, options = list(maxiter = 1000000))
      closeness = mean(estimate_closeness(graph, cutoff = 100))
      betweeness = mean(estimate_betweenness(graph, cutoff = 100))
      degree = mean(igraph::degree(graph))

      data.frame(analysis = this.analysis,
                network = this.network,
                Q = round(clustering$modularity,4), 
                n.groups = round(length(clustering),4),
                closeness  = round(closeness, 4),
                betweeness  = round(betweeness, 4),
                degree = mean(degree))
    }
}

# get network measures for all networks
analyses_list <- rep(ANALYSES, each = n.mas) 
networks_list <- rep(NETWORKS, each = n.mas)

dfs <- paper.data %>% 
  mutate_each(funs(as.factor)) %>%
  split(.$short_name) %>%
  rep(length(ANALYSES))

args = list(dfs, analyses_list, networks_list)

modularity.data = args %>%
  pmap(getModularity) %>%
  bind_rows(.id = "short_name")

modularity.data.long = modularity.data %>%
    mutate(closeness = log(closeness),
            betweeness = log(betweeness),
            degree = log(degree)) %>%
    gather("network.measure", "network.value", 4:8) %>%
    unite(network, analysis, network, sep = ".")
ggplot(modularity.data.long, aes(x = short_name, 
                                 y = network.value)) +
        geom_bar(stat = "identity", aes(fill = short_name))  +
        facet_wrap(network.measure ~ network, 
                   ncol = 4, scales = "free") +
        ggtitle("Network summary statistics") +
        theme_bw() +
        theme(axis.text.x = element_blank()) 

(4) Literature bias data

mean.ns = all_data %>%
  rowwise() %>%
  mutate(n_total = sum(c(n_1, n_2), na.rm = TRUE)) %>%
  group_by(short_name) %>%
  summarize(mean_n = mean(n_total))

ma.es = read.csv("ES_data_for_networks.csv") %>%
  select(short_name, overall.d, overall.d.age,
         fsn_string, egg.random.z) %>% 
  mutate(overall.d.age = abs(overall.d.age)) %>%
  rename(fail_safe_n = fsn_string) %>%
  left_join(mean.ns) %>%
  gather("bias.measure", "bias.value", 2:6) 
ma.es %>%
  filter(short_name %in% modularity.data.long$short_name) %>%
  ggplot(aes(x = short_name, y = bias.value)) +
        geom_bar(stat = "identity", aes(fill = short_name))  +
        facet_wrap(~bias.measure, scales = "free") +
        ggtitle("Bias summary statistics") +
        theme_bw() +
        theme(axis.text.x = element_blank()) 

Networks and correlations with bias

Get model fits. We’re predicting the bias value (e.g. fail-safe-n) with the network value (e.g. modularity), controlling for the number of papers in the MA. We’re fitting this model for each bias-network pair, for each network type (e.g. co-citation references).

net.es = left_join(modularity.data.long, ma.es) %>%
  dplyr::as_data_frame() %>%
  left_join(ns)

corr.sigs = net.es %>%
  filter(is.finite(network.value) & is.finite(bias.value)) %>%
  group_by(network, bias.measure, network.measure) %>%
  #do(tidy(cor.test(.$bias.value, .$value, na.action = "na.pass"))) %>%
  do(tidy(lm(bias.value ~ n.papers + network.value, data=.))) %>%
  filter(term == "network.value") %>%
  mutate(sig.col = ifelse(p.value < .05, "sig", "nsig")) %>%
  select(estimate, sig.col) %>%
  mutate(network.value = Inf, bias.value = Inf) # this is a hack
m = net.es %>%
  filter(network.measure == "Q" & bias.measure == "overall.d.age") %>% 
  filter(is.finite(network.value) & is.finite(bias.value)) %>%
  filter(network == "co-citation.references") %>%
  as.data.frame()  %>%
  lm(bias.value ~ network.value, data= .) %>%
  tidy()

Define plotting functions

network_plot <- function(d, analysis, network){
  networkname <- paste(analysis, network, sep = ".")
  
  d %>%
    split(.$short_name) %>%
    map(getGraphData, analysis, network) %>%
    bind_rows(.id = "short_name") %>%
    ggplot(aes(x = x, y = y, xend = xend, yend = yend)) +
       geom_edges(color = "grey50") +
       facet_wrap(~short_name) +
       geom_nodes(aes(color = short_name)) +
       ggtitle(networkname) +
       theme_blank() +
       theme(legend.position = "none")
}

corr_plot <- function(d, corrs, analysis, network){
  networkname <- paste(analysis, network, sep = ".")
  
  ggplot(filter(d, network == networkname), 
       aes(x = network.value, y = bias.value)) +
     geom_rect(data = filter(corrs, network == networkname),
              aes(fill = sig.col), 
            xmin = -Inf, xmax = Inf,
            ymin = -Inf, ymax = Inf, alpha = 0.2) +
    geom_point(aes(size = n.papers, color = short_name)) +
    geom_smooth(method= "lm", color = "black") +
    ggtitle(networkname)+
    facet_grid(bias.measure ~ network.measure, scales = "free") +
    scale_fill_manual(values = c( "grey99", "red1")) +
    theme_bw() +
    theme(legend.position = "none")
}

Co-citation references

Two articles are linked when both are cited in a third article.

Networks

p <- 1
network_plot(paper.data, ANALYSES[p], NETWORKS[p])

Correlations

corr_plot(net.es, corr.sigs, ANALYSES[p], NETWORKS[p])

Size of the point corresponds to number of papers in MA. Red facets indicate models where the network measure is a significant predictor of the bias value (controling for number of papers).

Collaboration authors

Scientific collaboration network is a network where nodes are authors and links are co-authorships.

Networks

p <- 2
network_plot(paper.data, ANALYSES[p], NETWORKS[p])

Correlations

corr_plot(net.es, corr.sigs, ANALYSES[p], NETWORKS[p])

Coupling authors

Two articles are said to be bibliographically coupled if at least one author in a cited source appears in the bibliographies of both articles (Kessler, 1963).

Networks

p <- 3
network_plot(paper.data, ANALYSES[p], NETWORKS[p])

Correlations

corr_plot(net.es, corr.sigs, ANALYSES[p], NETWORKS[p])

Co-occurences keywords

A link is added between two keywords if they co-occur in the same paper.

Networks

p <- 4
network_plot(paper.data, ANALYSES[p], NETWORKS[p])

Correlations

corr_plot(net.es, corr.sigs, ANALYSES[p], NETWORKS[p])

Session info

sessionInfo()
## R version 3.3.3 (2017-03-06)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Sierra 10.12.3
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] sna_2.4              network_1.13.0       statnet.common_3.3.0
##  [4] feather_0.3.1        shinydashboard_0.6.0 shiny_1.0.3         
##  [7] broom_0.4.2          igraph_1.0.1         intergraph_2.0-2    
## [10] ggnetwork_0.5.1      bibliometrix_1.5     stringr_1.2.0       
## [13] dplyr_0.5.0          purrr_0.2.2.2        readr_1.1.1         
## [16] tibble_1.3.3         ggplot2_2.2.1        tidyverse_1.1.1     
## [19] langcog_0.1.9001     tidyr_0.6.3          knitr_1.16          
## 
## loaded via a namespace (and not attached):
##  [1] ggrepel_0.6.5        Rcpp_0.12.10         lubridate_1.6.0     
##  [4] lattice_0.20-34      rscopus_0.4.6        assertthat_0.2.0    
##  [7] rprojroot_1.2        digest_0.6.12        psych_1.7.5         
## [10] mime_0.5             R6_2.2.1             cellranger_1.1.0    
## [13] plyr_1.8.4           backports_1.0.5      evaluate_0.10       
## [16] highr_0.6            httr_1.2.1           rlang_0.1.1         
## [19] lazyeval_0.2.0       readxl_1.0.0         Matrix_1.2-8        
## [22] rmarkdown_1.5        labeling_0.3         foreign_0.8-67      
## [25] munsell_0.4.3        httpuv_1.3.3         modelr_0.1.0        
## [28] mnormt_1.5-5         htmltools_0.3.6      flashClust_1.01-2   
## [31] codetools_0.2-15     MASS_7.3-45          leaps_3.0           
## [34] SnowballC_0.5.1      grid_3.3.3           xtable_1.8-2        
## [37] nlme_3.1-131         jsonlite_1.4         gtable_0.2.0        
## [40] factoextra_1.0.4     DBI_0.6-1            magrittr_1.5        
## [43] scales_0.4.1         stringi_1.1.5        reshape2_1.4.2      
## [46] scatterplot3d_0.3-40 xml2_1.1.1           RColorBrewer_1.1-2  
## [49] tools_3.3.3          forcats_0.2.0        hms_0.3             
## [52] parallel_3.3.3       yaml_2.1.14          colorspace_1.3-2    
## [55] cluster_2.0.5        rvest_0.3.2          FactoMineR_1.35     
## [58] haven_1.0.0
CS <- conceptualStructure(wos,field="ID",  k.max=10, stemming=FALSE, labelsize = 1.5, minDegree = 2)
# Paper level analyses
getModularity_paper <- function (my.paper.data, 
                        this.analysis,
                        this.network,
                        remove.multiple = FALSE, 
                        remove.isolates = FALSE,
                        noloops = FALSE) {

    # get web of science data for domain
    my.M = left_join(my.paper.data, M, by = c("doi" = "DI")) %>%
             as.data.frame() # necessary for biblioNetwork
    
    SEP = ifelse(this.analysis == "co-citation", ". ", "; ")
  
    # make the network
    this.net <- biblioNetwork(my.M, 
                   analysis = this.analysis, 
                   network = this.network, 
                   sep = SEP) 
    
    if (length(this.net > 0)) {
      # munge the network
      graph <- graph.adjacency(this.net, mode = "undirected")
      data.frame(analysis = this.analysis,
                 network = this.network,
                 closeness = estimate_closeness(graph, cutoff= 100),
                 betweeness = estimate_betweenness(graph, cutoff= 100),
                 degree = igraph::degree(graph))
    }
}

dfs <- paper.data %>% 
  mutate_each(funs(as.factor)) %>%
  split(.$short_name) %>%
  rep(4)
analyses <- rep(c("co-citation", "coupling", "co-occurrences", "collaboration"), each = 14) 
networks <- rep(c("references", "authors", "keywords", "authors"), each = 14)
args = list(dfs, analyses, networks)

modularity.data = args %>%
  pmap(getModularity_paper) %>%
  filter(analysis == "co-citation") %>%
  bind_rows(.id = "short_name")
k = M %>%
    left_join(paper.data, by=c("DI"= "doi")) %>%
    split(.$short_name) %>%
    map(biblioAnalysis) %>%
    bind_rows() 

bibliometrix::summary(k, pause = FALSE) 


S = summary(object = m, k = 10, pause = FALSE)
TO DO: 
  - non-significant vs. significant
  - get summary statistics using summary function
  - get ES controling for age, and ES error, also domain bias, n
  - correlate everything at domain and entire dataset
  - look at individual papers