Current preprint available here.
# Libraries
pacman::p_load(tidyverse, countrycode, sf, state, htmltools, htmlwidgets, urltools, janitor, DT, RColorBrewer, plotly, readxl, rvest)
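If pacman is not already installed, the load above will fail; a minimal bootstrap (optional, not part of the original pipeline) is:
# optional one-time setup, assuming pacman is not yet installed
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")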
# raw beacon data
df <- read_csv(here::here("data/beacon-public.csv"))
# ancillary data and vectors for cleaning and plotting
iso_to_name <- read_csv(here::here("data/country-iso3166-to-name.csv"))
canada.states <-
c(
"Alberta", "British Columbia", "Labrador", "Manitoba", "New Brunswick", "Newfoundland", "Nova Scotia", "Nunavut", "North West Terr.", "Ontario", "Prince Edward Is.", "Québec (Province)", "Saskatchewan", "Yukon"
)
# world map shapefile
shapefile <-
read_sf(here::here("data/TM_WORLD_BORDERS_SIMPL-0.3.shp")) %>%
clean_names() %>%
filter(name %in% c("Western Sahara", "Morocco")) %>% # handling boundary issues for morocco and western sahara
mutate(name = if_else(name == "Western Sahara", "Morocco", name)) %>%
arrange(name) %>%
group_by(name) %>%
summarize(
geometry = st_union(geometry),
lat = first(lat),
lon = first(lon),
region = first(region),
subregion = first(subregion)
) %>%
st_cast("MULTIPOLYGON") %>%
bind_rows(
read_sf(here::here("data/TM_WORLD_BORDERS_SIMPL-0.3.shp")) %>%
clean_names() %>%
filter(!(name %in% c("Western Sahara", "Morocco")))
)
# Cleaning oai urls
df <-
df %>%
mutate(oai_url = if_else(str_detect(oai_url, "^//"), str_c("http:", oai_url), oai_url))
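To illustrate what this scheme fix does (the URL below is hypothetical), a protocol-relative OAI endpoint gains an explicit http: prefix while fully qualified URLs pass through unchanged:
# hypothetical check of the scheme fix above
urls <- c("//journal.example.org/index/oai", "https://journal.example.org/index/oai")
if_else(str_detect(urls, "^//"), str_c("http:", urls), urls)
# expected: "http://journal.example.org/index/oai" "https://journal.example.org/index/oai"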
# Getting domain, tld, and merging in country names
df <-
bind_cols(
df,
df %>% pull(oai_url) %>% domain() %>% tld_extract()
) %>%
mutate(country_consolidated = str_to_lower(country_consolidated) %>% str_trim()) %>%
left_join(iso_to_name, by = c("country_consolidated" = "tld"))
# Cleaning country names
df <-
df %>%
mutate(
country = country_clean,
country = if_else(country == "Washington (State)", "United States", country),
country = if_else(str_detect(country, "United States|New York|District of Columbia"), "United States", country),
country = if_else(country %in% state.name, "United States", country),
country = if_else(country %in% canada.states, "Canada", country),
country = if_else(str_detect(country, "China"), "China", country),
country = if_else(str_detect(country, "Armenia"), "Armenia", country),
country = if_else(str_detect(country, "Georgia"), "Georgia", country),
country = if_else(str_detect(country, "Australia|New South Wales|Queensland|Victoria"), "Australia", country),
country = if_else(str_detect(country, "England|British"), "United Kingdom", country),
country = if_else(str_detect(country, "Russia|Soviet Union"), "Russia", country),
country = if_else(str_detect(country, "Palestine"), "Palestine", country),
country = if_else(country == "Korea (South)", "South Korea", country)
) %>%
select(-country_clean, -country_marc, -country_issn, -country_tld, -country_ip)
# Mapping countries to continents
continents <-
df %>%
pull(country) %>%
countrycode(origin = "country.name", destination = "continent")
df <- df %>% bind_cols(continents)
# Rename last column to `continent`
names(df)[length(names(df))] <- "continent"
# Filter to active OJS journals
df <-
df %>%
filter(application == "ojs", record_count_2020 >= 5) %>%
distinct(oai_url, repository_name, set_spec, .keep_all = T)
Total active journals using OJS (JUOJS):
df %>% count()
Total articles published in active JUOJS:
df %>% summarise(total = sum(record_count_2020, na.rm = T))
Average articles published in active JUOJS:
df %>% summarise(total = mean(record_count_2020, na.rm = T))
Total distinct ISSNs:
df %>%
mutate(
issn = str_extract(issn, "[^\n]+") # correcting for multiple reported issns
) %>%
distinct(issn) %>%
drop_na(issn) %>%
count()
# global df
df_world <-
df %>%
drop_na(country) %>%
count(country, name = "total")
labels <- function(x) {
if_else(x < 500, as.character(x), "500+")
}
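A quick sanity check of this helper (illustrative values only): counts at or above 500 collapse into the top legend bin.
# illustrative input; 500 and above are shown as "500+"
labels(c(0, 250, 500))
# expected: "0" "250" "500+"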
shapefile %>%
clean_names() %>%
rename(country = name) %>%
mutate(
country = if_else(country == "Libyan Arab Jamahiriya", "Libya", country),
country = if_else(country == "United Republic of Tanzania", "Tanzania", country),
country = if_else(country == "Cote d'Ivoire", "Côte d'Ivoire", country),
country = if_else(country == "Congo", "Republic of the Congo", country),
country = if_else(country == "Viet Nam", "Vietnam", country),
country = if_else(str_detect(country, "Iran"), "Iran", country),
country = if_else(str_detect(country, "Korea, Republic of"), "South Korea", country),
country = if_else(str_detect(country, "Korea, Democratic People's Republic of"), "North Korea", country),
country = if_else(str_detect(country, "Surinam"), "Surinam", country)
) %>%
left_join(df_world, by = "country") %>% #arrange(area) %>% select(country, total, area)
filter(total > 0 | area > 1000) %>%
filter(country != "Antarctica") %>%
mutate(
total = replace_na(total, 0),
total = pmin(total, 500)
) %>%
ggplot() +
geom_sf(aes(fill = total), size = 0.1, color = "gray", show.legend = T) +
scale_fill_gradientn(
breaks = seq(0, 500, 100),
labels = labels,
colors = RColorBrewer::brewer.pal(n = 9, name = "Blues")
) +
guides(
fill =
guide_colorbar(
barheight = 0.5,
barwidth = 15,
title = "Journals",
title.vjust = 1,
)
) +
theme_void() +
theme(legend.position = "bottom")
read_csv(here::here("data/beacon-public.csv")) %>%
select(context_name, record_count_2010:record_count_2020) %>%
pivot_longer(cols = starts_with("record_count")) %>%
mutate(
name = parse_number(name)
) %>%
filter(value >= 5) %>%
count(name) %>%
mutate(name = as.integer(name)) %>%
ggplot(aes(name, n)) +
geom_line() +
geom_point(size = 2) +
theme_classic() +
scale_x_continuous(breaks = seq(2010, 2020, 1)) +
scale_y_continuous(breaks = seq(0, 25000, 5000)) +
theme(
axis.title = element_text(size = 14),
axis.text = element_text(size = 10),
axis.ticks = element_blank(),
plot.title = element_text(hjust = 0.5)
) +
labs(
x = "Year",
y = "Journals",
)
Total overlap:
df_wos <-
read_table(here::here("data/overlaps/wos.txt"), na = c("NULL", "NA", "")) %>%
clean_names() %>%
remove_empty() %>%
filter(str_detect(issn, "^[0-9]{4}-[0-9]{3}[0-9xX]$")) %>%
distinct(issn) %>%
drop_na(issn)
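As an illustration of the ISSN filter above (strings below are made up), only well-formed ISSNs of the form NNNN-NNNC survive:
# made-up strings; only properly formatted ISSNs match
issns <- c("1234-5678", "2049-363X", "12345678", "1234-56789")
str_detect(issns, "^[0-9]{4}-[0-9]{3}[0-9xX]$")
# expected: TRUE TRUE FALSE FALSE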
# total overlap
df %>%
mutate(
issn = str_extract(issn, "[^\n]+")
) %>%
inner_join(df_wos, by = "issn") %>%
count()
Top 10 countries in overlap:
# top 10 countries in overlap
df %>%
mutate(
issn = str_extract(issn, "[^\n]+")
) %>%
inner_join(df_wos, by = "issn") %>%
count(country) %>%
arrange(-n) %>%
head(10) %>%
mutate(country = fct_inorder(country) %>% fct_rev()) %>%
ggplot(aes(country, n)) +
geom_col(fill = "#0072B2") +
theme_classic() +
theme(
axis.title = element_text(size = 20),
axis.text = element_text(size = 18),
axis.ticks = element_blank()
) +
coord_flip() +
labs(
x = "Country", y = "Number of journals"
)
Total overlap:
# Scopus data
df_scopus <-
read_excel(here::here("data/overlaps/scopus.xlsx")) %>%
clean_names() %>%
remove_empty() %>%
transmute(
issn = str_replace(print_issn, "-", ""),
e_issn = str_replace(e_issn, "-", "")
) %>%
distinct() %>%
remove_empty()
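Both joins below compare hyphen-free ISSNs; a quick illustration of the normalisation (values are made up):
# made-up ISSNs showing the hyphen removal used for matching
str_replace(c("1234-5678", "2049-363X"), "-", "")
# expected: "12345678" "2049363X"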
# join A (issn to issn)
df_join_a <-
df %>%
drop_na(issn) %>%
transmute(
country,
issn = str_extract(issn, "[^\n]+"),
issn = str_replace(issn, "-", "")
) %>%
distinct(issn, .keep_all = T) %>%
inner_join(df_scopus %>% select(-e_issn), by = "issn")
# join B (issn to e-issn)
df_join_b <-
df %>%
drop_na(issn) %>%
transmute(
country,
issn = str_extract(issn, "[^\n]+"),
issn = str_replace(issn, "-", "")
) %>%
distinct(issn, .keep_all = T) %>%
inner_join(df_scopus %>% select(-issn), by = c("issn" = "e_issn"))
bind_rows(df_join_a, df_join_b) %>%
distinct() %>%
count()
# 1646/41957
# 1646/22809
Top 10 countries in overlap:
bind_rows(df_join_a, df_join_b) %>%
distinct() %>%
drop_na(country) %>%
count(country) %>%
arrange(-n) %>%
head(10) %>%
mutate(country = fct_inorder(country) %>% fct_rev()) %>%
ggplot(aes(country, n)) +
geom_col(fill = "#0072B2") +
theme_classic() +
theme(
axis.title = element_text(size = 20),
axis.text = element_text(size = 18),
axis.ticks = element_blank()
) +
coord_flip() +
labs(
x = "Country", y = "Number of journals"
)
Total overlap:
# Dimensions data
df_dimensions <-
read_excel(here::here("data/overlaps/dimensions.xlsx")) %>%
clean_names() %>%
remove_empty() %>%
transmute(
issn = str_replace(issn_print, "-", "") %>% na_if("NULL"),
e_issn = str_replace(issn_e, "-", "") %>% na_if("NULL")
) %>%
distinct() %>%
remove_empty()
# join A (issn to issn)
df_join_a <-
df %>%
drop_na(issn) %>%
transmute(
country,
issn = str_extract(issn, "[^\n]+"),
issn = str_replace(issn, "-", "")
) %>%
distinct(issn, .keep_all = T) %>%
inner_join(df_dimensions %>% select(-e_issn), by = "issn")
# join B (issn to e-issn)
df_join_b <-
df %>%
drop_na(issn) %>%
transmute(
country,
issn = str_extract(issn, "[^\n]+"),
issn = str_replace(issn, "-", "")
) %>%
distinct(issn, .keep_all = T) %>%
inner_join(df_dimensions %>% select(-issn), by = c("issn" = "e_issn"))
bind_rows(df_join_a, df_join_b) %>%
distinct() %>%
count()
# 12435/22809
# 12435/72990
Top 10 countries in overlap:
bind_rows(df_join_a, df_join_b) %>%
distinct() %>%
drop_na(country) %>%
count(country) %>%
arrange(-n) %>%
head(10) %>%
mutate(country = fct_inorder(country) %>% fct_rev()) %>%
ggplot(aes(country, n)) +
geom_col(fill = "#0072B2") +
theme_classic() +
theme(
axis.title = element_text(size = 20),
axis.text = element_text(size = 18),
axis.ticks = element_blank()
) +
coord_flip() +
labs(
x = "Country", y = "Number of journals"
)
Total overlap:
# EBSCO Host
df_ebsco <-
read_excel(here::here("data/overlaps/ebscohost.xls")) %>%
clean_names() %>%
remove_empty() %>%
distinct(issn) %>%
drop_na(issn)
df %>%
drop_na(issn) %>%
transmute(
country,
issn = str_extract(issn, "[^\n]+")
) %>%
inner_join(df_ebsco, by = "issn") %>%
count()
Top 10 countries in overlap:
df %>%
drop_na(issn) %>%
transmute(
country,
issn = str_extract(issn, "[^\n]+")
) %>%
inner_join(df_ebsco, by = "issn") %>%
drop_na(country) %>%
count(country) %>%
arrange(-n) %>%
head(10) %>%
mutate(country = fct_inorder(country) %>% fct_rev()) %>%
ggplot(aes(country, n)) +
geom_col(fill = "#0072B2") +
coord_flip() +
theme_classic() +
theme(
axis.title = element_text(size = 20),
axis.text = element_text(size = 10),
axis.ticks = element_blank()
) +
labs(
x = "Country", y = "Number of journals"
)
# Read in Python-processed Google Scholar URLs
## first iteration using journal URLs ##
df_gscholar <-
read_csv(here::here("scripts/gscholar_urls_mapped.csv")) %>%
select(-result_json) %>%
mutate(n_results = parse_integer(n_results))
df_gscholar <-
df %>%
separate(oai_url, c("url1", "url2"), "\n") %>%
remove_empty() %>%
rename(oai_url = url1) %>%
filter(!str_detect(oai_url, "\\?page=")) %>%
transmute(
country, total_record_count, context_name,
url = str_replace(oai_url, "index/oai", set_spec),
url = str_replace(url, "^https://", ""),
url = str_replace(url, "^http://", ""),
url = str_replace(url, "^www108.", ""),
url = str_replace(url, "^www5.", ""),
url = str_replace(url, "^www3.", ""),
url = str_replace(url, "^www2.", ""),
url = str_replace(url, "^www.", ""),
) %>%
bind_rows(
df %>%
separate(oai_url, c("url1", "url2"), "\n") %>%
remove_empty() %>%
rename(oai_url = url1) %>%
filter(str_detect(oai_url, "\\?page=")) %>%
transmute(
country, total_record_count, context_name,
url = str_replace(oai_url, "oai$", set_spec),
url = str_replace(url, "^https://", ""),
url = str_replace(url, "^http://", ""),
url = str_replace(url, "^www108.", ""),
url = str_replace(url, "^www5.", ""),
url = str_replace(url, "^www3.", ""),
url = str_replace(url, "^www2.", ""),
url = str_replace(url, "^www.", ""),
), .
) %>%
distinct() %>%
inner_join(df_gscholar, by = "url")
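To show the scheme and www-prefix stripping applied in both branches above (the URL is hypothetical):
# hypothetical OAI URL run through the same normalisation
url <- "https://www.journal.example.org/index/oai"
url %>%
  str_replace("^https://", "") %>%
  str_replace("^www.", "")
# expected: "journal.example.org/index/oai"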
# bind_cols(
# df_gscholar,
# df_gscholar %>% pull(url) %>% domain() %>% tld_extract()
# ) %>%
# left_join(read_csv(here::here("scripts/gscholar_citations.csv")), by = "url") %>%
# group_by(domain) %>%
# summarise(
# n_results = sum(n_results, na.rm = T),
# n_citations = sum(n_citations, na.rm = T)
# ) %>%
# filter(n_results > 0) %>%
# select(-n_results) %>%
# write_csv(here::here("data/scholar_present_domains_set1.csv"))
Total overlap:
# loading all domains
domains_in_scholar <-
bind_rows(
read_csv(here::here("data/scholar_present_domains_set1.csv")),
read_csv(here::here("data/scholar_present_domains_set2.csv")),
) %>%
distinct()
# total overlap
bind_cols(
df_gscholar,
df_gscholar %>% pull(url) %>% domain() %>% tld_extract()
) %>%
inner_join(domains_in_scholar, by = "domain") %>%
count()
# 22679/25671
#domains_in_scholar %>% summary()
#domains_in_scholar %>% filter(n_citations < 1)
# 552/8548
Top 10 countries in overlap:
# top 10 countries in overlap
bind_cols(
df_gscholar,
df_gscholar %>% pull(url) %>% domain() %>% tld_extract()
) %>%
inner_join(domains_in_scholar, by = "domain") %>%
count(country) %>%
arrange(-n) %>%
mutate(country = fct_inorder(country) %>% fct_rev()) %>%
head(10) %>%
ggplot(aes(country, n)) +
geom_col(fill = "#0072B2") +
scale_y_continuous(breaks = scales::breaks_width(1000)) +
theme_classic() +
theme(
axis.title = element_text(size = 20),
axis.text = element_text(size = 10),
axis.ticks = element_blank()
) +
coord_flip() +
labs(x = "Country", y = "Total journals")Number of citations on first Scholar page:
# number of citations on first gscholar page
domains_in_scholar %>%
mutate(n_citations = pmin(n_citations, 500)) %>%
ggplot(aes(n_citations)) +
geom_histogram(binwidth = 20, fill = "#0072B2") +
scale_x_continuous(labels = c("0", "100", "200", "300", "400", "500+")) +
hrbrthemes::theme_ipsum() +
labs(
x = "Total citations on first page",
y = "Number of journal domains"
) +
theme_classic() +
theme(
axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.ticks = element_blank()
)# ggsave("scholar_citations.png")Journals with non-zero citations on first page:
# alter threshold to see proportions as needed (e.g. 0, 1000, 10000)
threshold <- 0
domains_in_scholar %>% arrange(-n_citations) %>% filter(n_citations > threshold) %>% count()
Summary of citations on first page:
domains_in_scholar %>% summary()
##    domain            n_citations
##  Length:8548        Min.   :    0.0
##  Class :character   1st Qu.:   11.0
##  Mode  :character   Median :   50.0
##                     Mean   :  358.8
##                     3rd Qu.:  185.0
##                     Max.   :84656.0
Total overlap:
# Latindex
df_latindex <-
read_excel(here::here("data/overlaps/latindex.xlsx")) %>%
clean_names() %>%
remove_empty() %>%
select(issn, e_issn, online = en_linea) %>%
distinct()
# join A (issn to issn)
df_join_a <-
df %>%
drop_na(issn) %>%
transmute(
country,
issn = str_extract(issn, "[^\n]+")
) %>%
distinct(issn, .keep_all = T) %>%
inner_join(df_latindex %>% select(-e_issn), by = "issn")
# join B (issn to e-issn)
df_join_b <-
df %>%
drop_na(issn) %>%
transmute(
country,
issn = str_extract(issn, "[^\n]+")
) %>%
distinct(issn, .keep_all = T) %>%
inner_join(df_latindex %>% select(-issn), by = c("issn" = "e_issn"))
bind_rows(df_join_a, df_join_b) %>%
distinct() %>%
arrange(country, issn, -online) %>%
distinct(country, issn) %>%
count()
# 4208/24486
# 4208/6319
Top 10 countries in overlap:
bind_rows(df_join_a, df_join_b) %>%
distinct() %>%
arrange(country, issn, -online) %>%
distinct(country, issn) %>%
drop_na(country) %>%
count(country) %>%
arrange(-n) %>%
head(10) %>%
mutate(country = fct_inorder(country) %>% fct_rev()) %>%
ggplot(aes(country, n)) +
geom_col(fill = "#0072B2") +
coord_flip() +
labs(
x = "Country", y = "Number of journals"
) +
theme_classic() +
theme(
axis.title = element_text(size = 20),
axis.text = element_text(size = 18),
axis.ticks = element_blank()
)
Latin American countries for JUOJS:
# Latin American countries for JUOJS
df_latam <- read_csv(here::here("data/latindex_countries.csv"))
df %>%
inner_join(df_latam, by = "country") %>%
count()
df_lang_disc <-
read_csv(here::here("data/OJS_languages_disciplines.csv")) %>%
select(journal_url, language, discipline, journal_name = context_name) %>%
remove_empty()
ojs_domains <-
df %>%
mutate(domain = str_remove_all(domain, "^www.")) %>%
distinct(domain) %>%
pull(domain) %>%
suffix_extract() %>%
transmute(domain = str_c(domain, suffix, sep = ".")) %>%
distinct(domain) %>%
pull(domain)
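For context, a hedged illustration (hostnames are made up) of how suffix_extract() collapses full hosts to registrable domains in the step above:
# made-up hostnames; suffix_extract() splits host, registrable domain, and public suffix
suffix_extract(c("revistas.journal.ac.id", "ojs.journal.org")) %>%
  transmute(domain = str_c(domain, suffix, sep = "."))
# expected domains: "journal.ac.id" "journal.org"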
bind_cols(
df_lang_disc,
df_lang_disc %>% pull(journal_url) %>% domain() %>% tld_extract()
) %>%
transmute(
domain = str_replace(domain, "^https://", ""),
domain = str_replace(domain, "^http://", ""),
domain = str_replace(domain, "^www108.", ""),
domain = str_replace(domain, "^www5.", ""),
domain = str_replace(domain, "^www3.", ""),
domain = str_replace(domain, "^www2.", ""),
domain = str_replace(domain, "^www.", ""),
language, discipline
) %>%
distinct(domain, .keep_all = T) %>%
inner_join(domains_in_scholar, by = "domain") %>%
arrange(desc(n_citations)) %>%
mutate(domain = str_c('<a href="http://', domain, '" target="_blank">', domain, '</a>')) %>%
select(`Journal Domain` = domain, `Language` = language, `Discipline` = discipline, `Scholar First Page Citations` = n_citations) %>%
head(100) %>%
DT::datatable(escape = F, options = list(pageLength = 10))
process_html <- function(html_content) {
df_mini <-
tibble(
article = html_content %>%
html_nodes(".gs_ri .gs_rt a") %>%
html_text(),
domain = html_content %>%
html_nodes(".gs_ri .gs_rt a") %>%
html_attr("href") %>%
domain() %>%
suffix_extract() %>%
transmute(domain = str_c(domain, suffix, sep = ".")) %>%
pull(domain),
# citations = html_content %>%
# html_nodes(".gs_ri .gs_fl a") %>%
# html_text() %>%
# str_subset("^Cited by ") %>%
# parse_number()
) %>%
mutate(
domain_in_ojs = if_else(domain %in% ojs_domains, "Yes", "No"),
domain = if_else(domain == "google.com", "books.google.com", domain)
) %>%
# arrange(desc(citations)) %>%
clean_names(case = "title") %>%
rename(`JUOJS Domain` = `Domain in Ojs`, `Article/Book` = `Article`)
return(df_mini)
}
html_content1 <- read_html("https://scholar.google.com/scholar?cites=3069375885583311800&as_sdt=2005&sciodt=0,5&hl=en&inst=574688794952177237")
html_content2 <- read_html("https://scholar.google.com/scholar?start=10&hl=en&as_sdt=2005&sciodt=0,5&cites=3069375885583311800&scipsc=&inst=5746887945952177237")
html_content3 <- read_html("https://scholar.google.com/scholar?start=20&hl=en&as_sdt=2005&sciodt=0,5&cites=3069375885583311800&scipsc=&inst=5746887945952177237")
html_content4 <- read_html("https://scholar.google.com/scholar?start=30&hl=en&as_sdt=2005&sciodt=0,5&cites=3069375885583311800&scipsc=&inst=5746887945952177237")
html_content5 <- read_html("https://scholar.google.com/scholar?start=40&hl=en&as_sdt=2005&sciodt=0,5&cites=3069375885583311800&scipsc=&inst=5746887945952177237")
html_content6 <- read_html("https://scholar.google.com/scholar?start=50&hl=en&as_sdt=2005&sciodt=0,5&cites=3069375885583311800&scipsc=&inst=5746887945952177237")
html_content7 <- read_html("https://scholar.google.com/scholar?start=60&hl=en&as_sdt=2005&sciodt=0,5&cites=3069375885583311800&scipsc=&inst=5746887945952177237")
html_content8 <- read_html("https://scholar.google.com/scholar?start=70&hl=en&as_sdt=2005&sciodt=0,5&cites=3069375885583311800&scipsc=&inst=5746887945952177237")
html_content9 <- read_html("https://scholar.google.com/scholar?start=80&hl=en&as_sdt=2005&sciodt=0,5&cites=3069375885583311800&scipsc=&inst=5746887945952177237")
html_content10 <- read_html("https://scholar.google.com/scholar?start=90&hl=en&as_sdt=2005&sciodt=0,5&cites=3069375885583311800&scipsc=&inst=5746887945952177237")
bind_rows(
`1` = process_html(html_content1),
`2` = process_html(html_content2),
`3` = process_html(html_content3),
`4` = process_html(html_content4),
`5` = process_html(html_content5),
`6` = process_html(html_content6),
`7` = process_html(html_content7),
`8` = process_html(html_content8),
`9` = process_html(html_content9),
`10` = process_html(html_content10),
.id = "Page"
) %>%
write_csv(here::here("data/citation_samples/english.csv"))
read_csv(here::here("data/citation_samples/english.csv")) %>%
select(-Citations) %>%
DT::datatable()
JUOJS vs non-JUOJS domains among citations:
read_csv(here::here("data/citation_samples/english.csv")) %>%
count(`JUOJS Domain`, name = "Count")
html_content1 <- read_html("https://scholar.google.com/scholar?cites=6422621221779132632&as_sdt=2005&sciodt=0,5&hl=en&inst=5746887945952177237")
html_content2 <- read_html("https://scholar.google.com/scholar?start=10&hl=en&as_sdt=2005&sciodt=0,5&cites=6422621221779132632&scipsc=&inst=5746887945952177237")
html_content3 <- read_html("https://scholar.google.com/scholar?start=20&hl=en&as_sdt=2005&sciodt=0,5&cites=6422621221779132632&scipsc=&inst=5746887945952177237")
html_content4 <- read_html("https://scholar.google.com/scholar?start=30&hl=en&as_sdt=2005&sciodt=0,5&cites=6422621221779132632&scipsc=&inst=5746887945952177237")
html_content5 <- read_html("https://scholar.google.com/scholar?start=40&hl=en&as_sdt=2005&sciodt=0,5&cites=6422621221779132632&scipsc=&inst=5746887945952177237")
html_content6 <- read_html("https://scholar.google.com/scholar?start=50&hl=en&as_sdt=2005&sciodt=0,5&cites=6422621221779132632&scipsc=&inst=5746887945952177237")
html_content7 <- read_html("https://scholar.google.com/scholar?start=60&hl=en&as_sdt=2005&sciodt=0,5&cites=6422621221779132632&scipsc=&inst=5746887945952177237")
html_content8 <- read_html("https://scholar.google.com/scholar?start=70&hl=en&as_sdt=2005&sciodt=0,5&cites=6422621221779132632&scipsc=&inst=5746887945952177237")
html_content9 <- read_html("https://scholar.google.com/scholar?start=80&hl=en&as_sdt=2005&sciodt=0,5&cites=6422621221779132632&scipsc=&inst=5746887945952177237")
html_content10 <- read_html("https://scholar.google.com/scholar?start=90&hl=en&as_sdt=2005&sciodt=0,5&cites=6422621221779132632&scipsc=&inst=5746887945952177237")
bind_rows(
`1` = process_html(html_content1),
`2` = process_html(html_content2),
`3` = process_html(html_content3),
`4` = process_html(html_content4),
`5` = process_html(html_content5),
`6` = process_html(html_content6),
`7` = process_html(html_content7),
`8` = process_html(html_content8),
`9` = process_html(html_content9),
`10` = process_html(html_content10),
.id = "Page"
) %>%
write_csv(here::here("data/citation_samples/indonesian.csv"))
read_csv(here::here("data/citation_samples/indonesian.csv")) %>%
DT::datatable()
JUOJS vs non-JUOJS domains among citations:
read_csv(here::here("data/citation_samples/indonesian.csv")) %>%
count(`JUOJS Domain`, name = "Count")
html_content1 <- read_html("https://scholar.google.com/scholar?cites=12229533173691059082&as_sdt=2005&sciodt=0,5&hl=en&inst=5746887945952177237")
html_content2 <- read_html("https://scholar.google.com/scholar?start=10&hl=en&as_sdt=2005&sciodt=0,5&cites=12229533173691059082&scipsc=&inst=5746887945952177237")
html_content3 <- read_html("https://scholar.google.com/scholar?start=20&hl=en&as_sdt=2005&sciodt=0,5&cites=12229533173691059082&scipsc=&inst=5746887945952177237")
html_content4 <- read_html("https://scholar.google.com/scholar?start=30&hl=en&as_sdt=2005&sciodt=0,5&cites=12229533173691059082&scipsc=&inst=5746887945952177237")
html_content5 <- read_html("https://scholar.google.com/scholar?start=40&hl=en&as_sdt=2005&sciodt=0,5&cites=12229533173691059082&scipsc=&inst=5746887945952177237")
html_content6 <- read_html("https://scholar.google.com/scholar?start=50&hl=en&as_sdt=2005&sciodt=0,5&cites=12229533173691059082&scipsc=&inst=5746887945952177237")
html_content7 <- read_html("https://scholar.google.com/scholar?start=60&hl=en&as_sdt=2005&sciodt=0,5&cites=12229533173691059082&scipsc=&inst=5746887945952177237")
html_content8 <- read_html("https://scholar.google.com/scholar?start=70&hl=en&as_sdt=2005&sciodt=0,5&cites=12229533173691059082&scipsc=&inst=5746887945952177237")
html_content9 <- read_html("https://scholar.google.com/scholar?start=80&hl=en&as_sdt=2005&sciodt=0,5&cites=12229533173691059082&scipsc=&inst=5746887945952177237")
html_content10 <- read_html("https://scholar.google.com/scholar?start=90&hl=en&as_sdt=2005&sciodt=0,5&cites=12229533173691059082&scipsc=&inst=5746887945952177237")
bind_rows(
`1` = process_html(html_content1),
`2` = process_html(html_content2),
`3` = process_html(html_content3),
`4` = process_html(html_content4),
`5` = process_html(html_content5),
`6` = process_html(html_content6),
`7` = process_html(html_content7),
`8` = process_html(html_content8),
`9` = process_html(html_content9),
`10` = process_html(html_content10),
.id = "Page"
) %>%
write_csv(here::here("data/citation_samples/portuguese.csv"))
read_csv(here::here("data/citation_samples/portuguese.csv")) %>%
DT::datatable()
JUOJS vs non-JUOJS domains among citations:
read_csv(here::here("data/citation_samples/portuguese.csv")) %>%
count(`JUOJS Domain`, name = "Count")