Init
# Print numbers with 2 significant digits.
# The option is called "digits"; a misspelled "digit" is merely stored as an
# unused custom option and leaves the printing precision unchanged.
options(digits = 2)
library(pacman)
p_load(kirkegaard, rvest, lubridate, DT, rms, ggeffects)
Scrape
#fetch the page that lists all journals
journals_page = read_html("https://www.mdpi.com/about/journals")

#the first table on the page is the journal list; make column names legal
journals = df_legalize_names(html_table(journals_page)[[1]])
#drop the RSS column
journals$RSS = NULL

#strip everything from the first line break on (duplicated name text)
journals$Journal_Name = str_replace(journals$Journal_Name, "\n+[\\w ]+", "")

#journal urls: take every 7th cell of the journal table, starting at the
#first, and read the link targets found inside those cells
table_cells = html_nodes(html_node(journals_page, ".journaltable"), "tr td")
name_cells = table_cells[seq(1, length(table_cells), by = 7)]
journals$url = html_attr(html_nodes(name_cells, "a"), "href")

#abbreviation = url without the "/journal/" prefix
journals$abbrev = str_replace(journals$url, "/journal/", "")
#download the main page and the apc page of every journal into data/<year>/
this_year = now() %>% year()
data_dir = str_glue("data/{this_year}")

#make dir; stay quiet if it already exists
dir.create(data_dir, recursive = TRUE, showWarnings = FALSE)

#files we already have from an earlier run
files_this_year = dir(data_dir, pattern = "html$")

#loop and download main and apc page
#every file is fetched only when it is missing, so an interrupted run is
#completed on the next run instead of being skipped once some files exist
for (i in seq_along_rows(journals)) {
  i_journal = journals$abbrev[i]
  i_url = str_glue("https://www.mdpi.com{journals$url[i]}")
  i_main_file = str_glue("{i_journal}.html")
  i_apc_file = str_glue("{i_journal}_apc.html")

  #main journal page
  if (!(i_main_file %in% files_this_year)) {
    message(i_journal)
    read_html(i_url) %>%
      as.character() %>%
      write_lines(str_glue("{data_dir}/{i_main_file}"))
  }

  #apc page
  if (!(i_apc_file %in% files_this_year)) {
    message(i_journal, " (apc)")
    read_html(str_glue("{i_url}/apc")) %>%
      as.character() %>%
      write_lines(str_glue("{data_dir}/{i_apc_file}"))
  }

  #wait?
  #nah
  # Sys.sleep()
}
Parse
#get APC (article processing charge, in CHF) from each journal's saved apc page
#numeric NA from the start, so the column never has to change type
journals$APC = NA_real_
for (i in seq_along_rows(journals)) {
  i_journal = journals$abbrev[i]

  #apc page, serialized to one string once and reused for every search below
  i_apc_text = read_html(str_glue("data/{this_year}/{i_journal}_apc.html")) %>%
    as.character()

  #find value: first run of digits followed by a no-break space and "CHF"
  #the no-break space is written as the escape \u00a0 so the pattern does not
  #depend on how this file is encoded
  #NOTE(review): assumes the page separates amount and currency with U+00A0
  i_apc_chf = str_match(i_apc_text, "(\\d+)\\u00a0CHF")[, 2]

  #is subsidized or free? then the charge is 0
  i_subsidized = str_detect(i_apc_text, "fully subsidized")
  i_free = str_detect(i_apc_text, "free publication for well-prepared manuscript")
  if (i_subsidized || i_free) i_apc_chf = 0

  #save value (stays NA when no amount was found)
  journals$APC[i] = as.numeric(i_apc_chf)

  #wait?
  # Sys.sleep()
}
Table of journals
#interactive table: sorted by APC, key columns first, remaining columns after
journals_by_apc = arrange(journals, APC)
journals_by_apc = select(
  journals_by_apc,
  Journal_Name, APC, Launched, Total_Articles, everything()
)
DT::datatable(journals_by_apc)
Analysis
#correlation matrix of Launched, Total_Articles and APC
wtd.cors(select(journals, Launched, Total_Articles, APC))
## Launched Total_Articles APC
## Launched 1.0000000 -0.2625725 -0.2523478
## Total_Articles -0.2625725 1.0000000 0.5841938
## APC -0.2523478 0.5841938 1.0000000
#scatter plots of APC against each predictor
#case_names = "abbrev": presumably labels each point with the journal abbreviation (confirm in kirkegaard docs)
#Launched and price
GG_scatter(journals, "Launched", "APC", case_names = "abbrev")

#publications and price
GG_scatter(journals, "Total_Articles", "APC", case_names = "abbrev")

#log10-transform the article count (the +1 keeps a count of 0 finite),
#then fit a linear regression of APC on the transformed count and print it
journals = mutate(journals, Total_Articles_log = log10(Total_Articles + 1))
mod1 = ols(APC ~ Total_Articles_log, data = journals)
mod1
## Frequencies of Missing Values Due to Each Variable
## APC Total_Articles_log
## 2 0
##
## Linear Regression Model
##
## ols(formula = APC ~ Total_Articles_log, data = journals)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 215 LR chi2 206.46 R2 0.617
## sigma253.6886 d.f. 1 R2 adj 0.615
## d.f. 213 Pr(> chi2) 0.0000 g 349.202
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1042.39 -136.93 -28.86 167.91 799.52
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 76.3284 58.6988 1.30 0.1949
## Total_Articles_log 412.4180 22.2543 18.53 <0.0001
##
#model-based effect of Total_Articles_log, with the observed journals on top
apc_effect = ggeffect(mod1, "Total_Articles_log")
plot(apc_effect) +
  geom_point(data = journals, mapping = aes(Total_Articles_log, APC))
## Warning: Removed 2 rows containing missing values (geom_point).
