Init

#print numbers with 2 significant digits
#the option is called "digits"; "digit" is not a known option and is silently ignored
options(digits = 2)
library(pacman)
p_load(kirkegaard, rvest, lubridate, DT, rms, ggeffects)

Scrape

#fetch the page that lists all journals
journals_page = read_html("https://www.mdpi.com/about/journals")

#the first table on the page is the journal table; make column names legal
page_tables = html_table(journals_page)
journals = df_legalize_names(page_tables[[1]])

#RSS column is not used
journals$RSS = NULL

#names come doubled, keep only the part before the newline(s)
journals$Journal_Name = str_replace(journals$Journal_Name, "\n+[\\w ]+", "")

#all cells of the journal table, in document order
journal_cells = journals_page %>% 
  html_node(".journaltable") %>% 
  html_nodes("tr td")

#every 7th cell starting with the first
#presumably the table has 7 columns, making these the first-column cells; verify against page
name_cells = journal_cells[seq(1, length(journal_cells), by = 7)]

#journal urls are the link targets inside those cells
journals$url = name_cells %>% 
  html_nodes("a") %>% 
  html_attr("href")

#abbrev is the url without the "/journal/" prefix
journals$abbrev = str_replace(journals$url, "/journal/", "")

#snapshots are stored per year
this_year = now() %>% year()
year_dir = file.path("data", this_year)

#pages we have already; empty if the directory does not exist yet
files_this_year = dir(year_dir, pattern = "html$")

#make dir, also creating "data/" if needed
#an already existing directory is expected on reruns, so no warning for it
dir.create(year_dir, recursive = TRUE, showWarnings = FALSE)

#loop and download main and apc page
#each file is checked on its own, so an interrupted run is resumed on the next
#run instead of being skipped as a whole and leaving pages missing for parsing
for (i in seq_along_rows(journals)) {
  i_journal = journals$abbrev[i]
  
  #main journal page and apc page, with the file each one is saved to
  i_urls = paste0("https://www.mdpi.com", journals$url[i], c("", "/apc"))
  i_files = paste0(i_journal, c(".html", "_apc.html"))
  
  for (j in seq_along(i_urls)) {
    #skip if we have the file already
    if (i_files[j] %in% files_this_year) next
    
    message(i_files[j])
    
    #save the page source as text
    read_html(i_urls[j]) %>% 
      as.character() %>% 
      write_lines(file.path(year_dir, i_files[j]))
    
    #wait?
    #nah
    # Sys.sleep()
  }
}

Parse

#get APC from each journal
#start from a numeric NA so the column has its final type from the beginning
journals$APC = NA_real_
for (i in seq_along_rows(journals)) {
  i_journal = journals$abbrev[i]
  
  #apc page as a single string, converted once and reused for every pattern below
  i_apc = read_html(str_glue("data/{this_year}/{i_journal}_apc.html")) %>% as.character()
  
  #find value: first run of digits directly followed by " CHF", NA if there is none
  #NOTE(review): a price written with a thousands separator ("1,800 CHF") would
  #give only the digits after the separator -- confirm the page format
  i_apc_chf = str_match(i_apc, "(\\d+) CHF")[, 2]
  
  #is subsidized? 
  i_subsidized = str_detect(i_apc, "fully subsidized")
  i_free = str_detect(i_apc, "free publication for well-prepared manuscript")
  
  #both flags are single values, so use the scalar operator
  if (i_subsidized || i_free) i_apc_chf = 0
  
  #save value
  journals$APC[i] = as.numeric(i_apc_chf)
  
  #wait?
  # Sys.sleep()
}

Table of journals

#interactive table of the journals, cheapest first
#the four key columns are moved to the front, the rest follow unchanged
journals_by_apc = arrange(journals, APC)
journals_by_apc = select(journals_by_apc, Journal_Name, APC, Launched, Total_Articles, everything())
DT::datatable(journals_by_apc)

Analysis

#correlation matrix of launch year, article count and APC (no weights supplied)
wtd.cors(select(journals, Launched, Total_Articles, APC))
##                  Launched Total_Articles        APC
## Launched        1.0000000     -0.2625725 -0.2523478
## Total_Articles -0.2625725      1.0000000  0.5841938
## APC            -0.2523478      0.5841938  1.0000000
#publications and price
#APC against launch year, points labelled with the journal abbreviation
GG_scatter(journals, "Launched", "APC", case_names = "abbrev")

#APC against total number of articles, same labelling
GG_scatter(journals, "Total_Articles", "APC", case_names = "abbrev")

#log10 of the article count is used as predictor; +1 keeps journals with 0 articles finite
journals[["Total_Articles_log"]] = log10(journals[["Total_Articles"]] + 1)

#linear regression of APC on the logged article count
mod1 = ols(formula = APC ~ Total_Articles_log, data = journals)

#show the fitted model
mod1
## Frequencies of Missing Values Due to Each Variable
##                APC Total_Articles_log 
##                  2                  0 
## 
## Linear Regression Model
##  
##  ols(formula = APC ~ Total_Articles_log, data = journals)
##  
##  
##                   Model Likelihood     Discrimination    
##                      Ratio Test           Indexes        
##  Obs       215    LR chi2    206.46    R2       0.617    
##  sigma253.6886    d.f.            1    R2 adj   0.615    
##  d.f.      213    Pr(> chi2) 0.0000    g      349.202    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -1042.39  -136.93   -28.86   167.91   799.52 
##  
##  
##                     Coef     S.E.    t     Pr(>|t|)
##  Intercept           76.3284 58.6988  1.30 0.1949  
##  Total_Articles_log 412.4180 22.2543 18.53 <0.0001 
## 
#model predictions over the predictor, with the observed journals drawn on top
mod1_effect = ggeffect(mod1, "Total_Articles_log")
plot(mod1_effect) +
  geom_point(
    data = journals,
    mapping = aes(x = Total_Articles_log, y = APC)
  )
## Warning: Removed 2 rows containing missing values (geom_point).