library(readr)
library(textmineR)
library(tidyverse)
library(tm)
library(rAltmetric)
library(dplyr)
library(tidyr)
library(tidyselect)
library(forcats)
library(bipartiteD3)
library(igraph)
library(data.table)
In this example we begin with a Scopus data set and merge it with Altmetric data at a later stage. This is because we want to run topic modelling on abstract data, which is available from Scopus but not from Altmetric. Below I describe the steps needed to upload data from Scopus.
After we have agreed on search queries for Scopus, you ran the search. If the search returns fewer than 2,000 publications (unlikely in our case), you can just download everything at once. However, if there are more than 2,000 publications, there are several tricks. You can split the downloads by year, for example, by limiting the search results to one year at a time.
If there are more than 2,000 publications in a single year, an additional trick can be used: first sort the publications by newest and export the data, then sort by oldest and export the data again. Any overlap will be fixed in R by filtering unique titles (see the sketch below). However, this method only works if there are no more than 4,000 publications per year. If there are more, additional steps will be needed, but I doubt there will be more than 4,000 per year.
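As an aside, here is a minimal sketch of the overlap fix, assuming two hypothetical data frames export_newest and export_oldest standing in for the two exports of one year; the actual pipeline below achieves the same effect with unique() over whole rows.
# hypothetical objects: the newest-first and oldest-first exports for one year
combined <- dplyr::bind_rows(export_newest, export_oldest)
# keep one row per title, dropping the overlap between the two exports
combined <- dplyr::distinct(combined, Title, .keep_all = TRUE)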
Finally, it is important to gather all export files in one folder, because we will merge them in R.
rm(list=ls())
setwd("~/Desktop/R/Network Analysis Group Project/Scopus\ Exports/Ebola")
scopus_data <- list.files(full.names = TRUE) %>% # every export file in the folder
  lapply(read_csv) %>%                           # read each csv
  bind_rows() %>%
  as.data.frame() %>%
  unique() %>%                                   # drop duplicate rows from the overlapping exports
  filter(Abstract != "[No abstract available]") %>%
  dplyr::mutate(id = row_number())               # document id used later for merging
We run topic modelling using the LDA algorithm. For this we create a document term matrix and then calculate which terms are most likely to occur within which document. We then automatically create labels for each topic and assign the labels to each document.
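Before the real pipeline, a toy illustration of what a document term matrix is (the two 'abstracts' here are invented): rows are documents, columns are terms, and each cell counts how often a term occurs in a document.
# toy example only, not part of the pipeline
toy_dtm <- CreateDtm(doc_vec = c("ebola vaccine trial results", "vaccine antibody response data"),
                     doc_names = c("doc1", "doc2"),
                     ngram_window = c(1, 1),
                     verbose = FALSE)
as.matrix(toy_dtm) # rows = documents, columns = terms, cells = counts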
The stopwords can be decided after you first run the modelling. We will have to come back to this step, include the stopwords in this character vector and run the modelling again.
setwd("~/Desktop/R/Network Analysis Group Project")
my_stop_words <- c("ebola", "virus", "viruses", "disease", "diseases",
"open access", "creative commons", "eigenvalue", "infection", "ebov", "health")
dtm <- CreateDtm(doc_vec = scopus_data$Abstract, # textual data we use for modelling
doc_names = scopus_data$id, # document names
ngram_window = c(1, 2), # minimum and maximum n-gram length
stopword_vec = c(stopwords::stopwords("en"), # stopwords to be excluded from the text
stopwords::stopwords(source = "smart"),
my_stop_words), #additional stopwords
lower = TRUE, # convert every word to lowercase
remove_punctuation = TRUE, # remove punctuation
remove_numbers = TRUE, # remove numbers
verbose = FALSE)
dtm <- dtm[, colSums(dtm) > 2] # drop rare terms that occur no more than twice in the whole corpus
# Below we fit the document term matrix into the LDA model. This step might take some time and load your CPU. It may appear as if your computer has frozen, but be patient and wait until the console shows that the modelling is over.
set.seed(12345)
model <- FitLdaModel(dtm = dtm,
k = 10,
iterations = 200,
burnin = 180,
alpha = 0.1,
beta = 0.05,
optimize_alpha = TRUE,
calc_likelihood = TRUE,
calc_coherence = TRUE,
calc_r2 = TRUE,
cpus = 2)
# Below are some additional steps that give descriptive information about the model, such as term prevalence and likelihood.
model$r2 # overall goodness of fit
plot(model$log_likelihood, type = "l") # the likelihood should flatten out as the sampler converges
model$top_terms <- GetTopTerms(phi = model$phi, M = 5) # five most probable terms per topic
model$prevalence <- colSums(model$theta) / sum(model$theta) * 100 # share of the corpus devoted to each topic
plot(model$prevalence, model$alpha, xlab = "prevalence", ylab = "alpha")
model$labels <- LabelTopics(assignments = model$theta > 0.05, # a document counts towards a topic above this threshold
                            dtm = dtm,
                            M = 1) # one label per topic
# These lines return the model summary with the automatically generated topic labels and the most recurrent terms within each topic. If any of those terms seem unnecessarily vague or non-informative, you should include them in the stopwords above and run the code again starting from the document term matrix (dtm).
model$summary <- data.frame(topic = rownames(model$phi),
label = model$labels,
coherence = round(model$coherence, 3),
prevalence = round(model$prevalence,3),
top_terms = apply(model$top_terms, 2, function(x){
paste(x, collapse = ", ")
}),
stringsAsFactors = FALSE)
model$summary[ order(model$summary$prevalence, decreasing = TRUE) , ][ 1:10 , ]
## topic label_1 coherence prevalence
## t_10 t_10 west_africa 0.101 17.655
## t_1 t_1 hemorrhagic_fever 0.068 13.695
## t_8 t_8 glycoprotein_gp 0.237 12.928
## t_6 t_6 west_africa 0.073 9.864
## t_5 t_5 decomposition_evd 0.062 9.320
## t_9 t_9 hemorrhagic_fever 0.206 8.881
## t_7 t_7 sierra_leone 0.120 8.323
## t_3 t_3 external_ventricular 0.364 7.009
## t_2 t_2 real_time 0.143 6.319
## t_4 t_4 sierra_leone 0.072 6.006
## top_terms
## t_10 public, global, outbreak, response, research
## t_1 human, viral, fever, infections, infectious
## t_8 viral, vp, protein, cells, cell
## t_6 outbreak, evd, cases, epidemic, transmission
## t_5 model, data, based, evd, method
## t_9 vaccine, gp, vaccines, antibodies, antibody
## t_7 care, evd, patients, workers, training
## t_3 evd, patients, csf, ventricular, external
## t_2 detection, samples, laboratory, assay, diagnostic
## t_4 evd, patients, clinical, ci, survivors
# These are our topic labels from the final model with stopwords included
model$labels
## label_1
## t_1 "hemorrhagic_fever"
## t_2 "real_time"
## t_3 "external_ventricular"
## t_4 "sierra_leone"
## t_5 "decomposition_evd"
## t_6 "west_africa"
## t_7 "sierra_leone"
## t_8 "glycoprotein_gp"
## t_9 "hemorrhagic_fever"
## t_10 "west_africa"
# convert labels to a data frame so we can merge; the column is named label_1
# explicitly because rename(higher = label_1) further below expects that name
label_df <- data.frame(topic = rownames(model$labels),
                       label_1 = model$labels[, 1],
                       stringsAsFactors = FALSE)
# get the top topic for each document
top_topics <- apply(model$theta, 1, function(x) names(x)[which.max(x)][1])
# convert the top topics for each document so we can merge
top_topics <- data.frame(document = names(top_topics), top_topic = top_topics, stringsAsFactors = FALSE)
# merge together: now each document has a label from its top topic
top_topics <- merge(top_topics, label_df, by.x = "top_topic", by.y = "topic", all.x = TRUE)
# we merge the topics data set with the original scopus data set
doc_topics <- merge(top_topics, scopus_data, by.x = "document", by.y = "id")
head(doc_topics)
write.csv(doc_topics, file = "doc_topics.csv")
# This bit does not work for the time being. There are some issues we have to resolve with the API license in order to automate this step. For now, we just use DOIs from the doc_topics.csv file to manually export Altmetric data from Altmetric Explorer.
doc_topics_DOI <- doc_topics %>%
filter(!is.na(DOI))
ids <- list(doc_topics_DOI$DOI)
# pass the key directly: getOption() expects the name of an option, not the key itself
alm <- function(x) altmetrics(doi = x, apikey = "854eee8c2356b95c8c527ebf0d5fa893") %>%
  altmetric_data()
api_alt_data <- pmap_df(ids, alm)
# Before proceeding to this step, you need to go to the doc_topics.csv file, copy all DOIs and paste them into Altmetric Explorer. Then you run a search and export two additional data sets: 1) policy mentions of all publications tracked by Altmetric (Policy_ebola); 2) a general Altmetric export (Altmetric_ebola) from the 'Research Outputs' tab.
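Instead of copying the DOIs out of the csv by hand, you can dump them to a plain text file ready for pasting into Altmetric Explorer. This is just a convenience sketch; the file name ebola_dois.txt is my own choice.
# one DOI per line, duplicates and missing values removed
writeLines(unique(na.omit(doc_topics$DOI)), "ebola_dois.txt")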
alt_policy_mentions <- read.csv("/Users/dmitrymalkov/Desktop/R/Network Analysis Group Project/Policy_ebola.csv") %>%
data.frame %>%
select(Outlet.or.Author, Mention.Title, Country, Research.Output.Title) %>%
rename(Title = Research.Output.Title)
general_alt_data <- read_csv("/Users/dmitrymalkov/Desktop/R/Network Analysis Group Project/Altmetric_ebola.csv") %>%
data.frame %>%
select(Funder, Title, Policy.mentions)
# Here we merge all data sets to get a complete data set ready for network analysis using the bipartite D3 package
complete_df <- merge(doc_topics, alt_policy_mentions, by.x = "Title", by.y ="Title")
complete_df <- merge(complete_df, general_alt_data, by.x = "Title", by.y = "Title")
complete_df <- complete_df %>%
separate_rows(Funder, sep = "; ", convert = FALSE) %>%
# The bipartiteD3 package takes specific variable names, such as higher and lower. So below we convert our variable names into the right format for visualisation.
rename(higher = label_1) %>%
rename(lower = Funder) %>%
rename(freq = Policy.mentions)
# We create an additional category for all publications where funder information is not available
complete_df$lower[is.na(complete_df$lower)] <- "Funder not available"
complete_df$webID <- "Funders Topics"
graph_data_tri <- complete_df %>%
select(lower, higher, Outlet.or.Author, freq) %>%
arrange(freq) %>%
filter(freq > 0)
# prefix node names so that funders (F), topics (T) and policy outlets (P) stay distinct in the graph
graph_data_tri$lower <- paste("F", graph_data_tri$lower, sep = "_")
graph_data_tri$higher <- paste("T", graph_data_tri$higher, sep = "_")
graph_data_tri$Outlet.or.Author <- paste("P", graph_data_tri$Outlet.or.Author, sep = "_")
fund_publ <- graph_data_tri %>%
  select(lower, higher) %>%
  graph_from_data_frame(directed = FALSE) %>%
  as_adjacency_matrix(names = TRUE, sparse = FALSE)
publ_poli <- graph_data_tri %>%
  select(higher, Outlet.or.Author) %>%
  graph_from_data_frame(directed = FALSE) %>%
  as_adjacency_matrix(names = TRUE, sparse = FALSE)
el_fund_publ <- as.data.frame(as.table(fund_publ), stringsAsFactors = FALSE)
el_publ_poli <- as.data.frame(as.table(publ_poli), stringsAsFactors = FALSE)
fund_publ_poli <- rbind(el_fund_publ, el_publ_poli)
g <- graph_from_data_frame(fund_publ_poli[fund_publ_poli$Freq != 0, ])
V(g)$type <- substr(V(g)$name, 1, 1)
adj <- as_adjacency_matrix(g, sparse = FALSE, attr = "Freq")
t <- match(V(g)$type, c("F", "T", "P"))
type <- V(g)$type
type <- type %>%
str_replace("F", "circle") %>%
str_replace("T", "square") %>%
str_replace("P", "circle")
l <- layout_with_kk(g)
set.seed(12345)
plot(g, layout = l, vertex.color = t, vertex.shape = type, vertex.size = 2,
     edge.arrow.size = 0, vertex.label = NA, margin = 0.1, asp = 0.4, edge.width = 0.1,
     vertex.frame.color = "black") # plot.igraph has no 'directed' argument; arrows are suppressed via edge.arrow.size
l <- layout_with_fr(g, dim = 2, miny = t, maxy = t, weights = NULL) # 2-D layout; miny/maxy pin each node type to its own horizontal band
plot(g, layout = l, vertex.color = t, vertex.shape = type, vertex.size = 2,
     edge.arrow.size = 0, vertex.label = NA, margin = 0.1, asp = 0.4, edge.width = 0.1,
     vertex.frame.color = "black")
### funder stats
# betweenness centrality of the funder nodes
btw <- betweenness(g, v = V(g)[V(g)$type == "F"])
btw <- data.frame(name = names(btw), betweenness = btw, stringsAsFactors = FALSE)
# degree centrality of the funder nodes
dgr <- degree(g, v = V(g)[V(g)$type == "F"], mode = "all")
dgr <- data.frame(name = names(dgr), degree = dgr, stringsAsFactors = FALSE)
stats_F <- merge(dgr, btw, by.x = "name", by.y = "name")
stats_F$name <- stats_F$name %>%
str_replace("F_", "")
stats_F <- stats_F[order(-stats_F$degree),]
head(stats_F)
## name degree betweenness
## 56 National Institute of Allergy and Infectious Diseases 10 465.5225
## 63 National Institute of General Medical Sciences 10 465.5225
## 30 Funder not available 8 182.5227
## 45 Medical Research Council 8 182.5227
## 49 National Cancer Institute 8 333.6774
## 93 Wellcome Trust 8 182.5227
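The same pattern works for the other node types. As a quick usage example, reusing the functions above, these are the most connected topic nodes:
dgr_T <- degree(g, v = V(g)[V(g)$type == "T"], mode = "all")
head(sort(dgr_T, decreasing = TRUE)) # topics linking the most funders and policy outlets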
In the case of Ebola, I'm only including data on publications with MORE than 6 policy mentions. This is a high threshold, but without it the visualisation looks bad: there are too many funders and they clutter the visualisation. You can try it yourself. Here it is crucial to tweak the graph parameters until the visualisation looks acceptable. For all graph parameters, see ?bipartite_D3 and the bipartiteD3 vignette.
graph_data <- complete_df %>%
select(higher, lower, webID, freq) %>%
arrange(freq) %>%
filter(freq > 6)
graph_data$higher <- gsub("clinical_trials", "Clinical Trials", graph_data$higher)
graph_data$higher <- gsub("hemorrhagic_fever", "Hemorrhagic Fever", graph_data$higher)
graph_data$higher <- gsub("sierra_leone", "West Africa", graph_data$higher)
graph_data$higher <- gsub("west_africa", "West Africa", graph_data$higher)
graph_data <- graph_data %>%
filter(higher != "prevention_control")
order <- as.vector(graph_data$lower)
bi_graph <- bipartite::frame2webs(graph_data)
# We assign colours manually to each topic so that they match with the second bipartite graph (next section)
ManualColours <- c("Hemorrhagic Fever" = "green", "West Africa" = "red")
# And here is the resulting graph
bipartite_D3(bi_graph,
MainFigSize = c(2600,550), # canvas size (x,y)
colouroption = "manual",
NamedColourVector = ManualColours,
ColourBy = 2,
SiteNames = " ",
PercPos = c(480,200), # position of percentages (right, left)
BoxLabPos = c(20, 20), # distance between the labels and the bar
Pad = 7, # spacing between the labels
PrimaryLab = "Funders",
SecondaryLab = "Publication Topics",
SortPrimary = order,
MinWidth = 2,
BarSize = 10)
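Since the two bipartite graphs are later merged in a graphic editor, it helps to keep a standalone copy of each. bipartite_D3() is built on r2d3, so its return value behaves as an htmlwidget; below is a sketch, assuming the htmlwidgets package is installed, that saves the graph as a self-contained HTML page (the file name is my own choice). The same works for the second graph in the next section.
# save the interactive graph to a standalone HTML file for screenshotting
p <- bipartite_D3(bi_graph,
                  colouroption = "manual",
                  NamedColourVector = ManualColours,
                  ColourBy = 2)
htmlwidgets::saveWidget(p, "funders_topics.html", selfcontained = TRUE)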
# My experiments with tripartite Sankey diagrams have not been very successful. They just don't look as sleek and there are fewer customisable parameters. Therefore, my proposal is to produce a second bipartite graph using bipartiteD3 and then graphically merge the two graphs to create a tripartite network. Merging the graphs will not involve coding, but graphic editing. I think it's ok for the purpose of visualisation.
complete_df2 <- complete_df %>%
mutate(lower = higher) %>%
mutate(higher = Outlet.or.Author)
complete_df2$higher2 <- as.character(complete_df2$higher)
complete_df2$higher2[is.na(complete_df2$higher)] <- "Policy Information n/a"
complete_df2$higher <- as.character(complete_df2$higher2)
complete_df2$webID <- "Policy mentions"
graph_data2 <- complete_df2 %>%
select(higher, lower, webID, freq) %>%
filter(freq > 6)
graph_data2$lower <- gsub("clinical_trials", "Clinical Trials", graph_data2$lower)
graph_data2$lower <- gsub("hemorrhagic_fever", "Hemorrhagic Fever", graph_data2$lower)
graph_data2$lower <- gsub("sierra_leone", "West Africa", graph_data2$lower)
graph_data2$lower <- gsub("west_africa", "West Africa", graph_data2$lower)
graph_data2 <- graph_data2 %>%
filter(lower != "prevention_control")
bi_graph2 <- bipartite::frame2webs(graph_data2)
# And here is the resulting graph
bipartite_D3(bi_graph2,
MainFigSize = c(2000,550),
colouroption = "manual",
NamedColourVector = ManualColours,
ColourBy = 1,
SiteNames = ".",
PercPos = c(200,500),
BoxLabPos = c(20, 20),
Pad = 7,
PrimaryLab = " ",
SecondaryLab = "Policy Organisations",
MinWidth = 2,
BarSize = 10)
# After merging both networks in a graphic editor the result should look similar to this.
Linking Funders with Policy Organisations through research publications on Ebola
# We use separate software (JHive) to produce Hive plots, but the data is prepared and exported from R. Below are the steps that you need to take.
setwd("~/Desktop/R/Network Analysis Group Project/Scopus\ Exports/Ebola")
scopus_hive <- list.files(full.names = TRUE) %>%
lapply(read_csv) %>%
bind_rows() %>%
as.data.frame() %>%
unique() %>%
dplyr::mutate(id = row_number())
hive_data <- merge(scopus_hive, alt_policy_mentions, by.x="Title", by.y="Title", all.x = TRUE)
hive_data <- merge(hive_data, general_alt_data, by.x="Title", by.y="Title", all.x = TRUE)
hive_data$Funder[is.na(hive_data$Funder)] <-"Funder not available"
hive_data$Policy.mentions[is.na(hive_data$Policy.mentions) |
hive_data$Policy.mentions == 0] <-"No policy mentions"
hive_data <- hive_data %>%
separate_rows(Funder, sep = "; ", convert = FALSE) %>%
mutate(lab1 = Funder, lab2 = Title, lab3 = Policy.mentions) %>%
select(lab1, lab2, lab3)
# Build an edge list by pairing each label with the one in the next column:
# funder -> title, then title -> policy mentions.
edge_list <-
  hive_data %>%
  mutate(row = row_number()) %>%
  gather('column', 'lab1', -row) %>%                   # long format: one label per row
  mutate(column = match(column, names(hive_data))) %>% # encode the column order as 1, 2, 3
  group_by(row) %>%
  arrange(column) %>%
  mutate(lab2 = lead(lab1)) %>%                        # pair each label with the next column's label
  ungroup() %>%
  filter(!is.na(lab2)) %>%
  select(lab1, lab2)
m <- as.matrix(as_adjacency_matrix(graph_from_data_frame(edge_list)))
m1 <- graph_from_adjacency_matrix(m)
# Below we are exporting our network data in .dot format, which is required by JHive. Once the file is saved, further processing takes place in JHive.
write_graph(m1, "graph.dot", format = "dot")
# The Hive plot will look similar to this.
Linking Funders with Policy Organisations through research publications