Abstract
Text analysis of AGILE conference papers - source code at https://github.com/nuest/reproducible-research-and-giscience. This document does not install the required R packages by default. You can run the script install.R to install all required dependencies on a new R installation, or use install.packages(..) to install missing R packages.
# Optional one-off setup, not executed by default: install.R installs all
# required dependencies on a new R installation.
source("install.R")
The text analysis is based on the R package tidytext from the tidyverse suite of packages and uses the dplyr grammar. Read the tidytext tutorial to learn about the functions and concepts used.
The plots and tables of survey data and evaluation use the package ggplot2.
Required libraries and runtime environment description.
library("pdftools")
library("stringr")
library("knitr")
library("tibble")
library("tidytext")
library("purrr")
library("dplyr")
library("wordcloud")
library("RColorBrewer")
library("readr")
library("ggplot2")
library("rvest")
library("ggthemes")
library("grid")
library("gridBase")
library("gridExtra")
library("devtools")
library("rlang")
library("huxtable")
library("here")
library("httr")
library("googledrive")
library("SnowballC")
Seed is set for making word cloud generation reproducible.
# Fixed seed so that the word cloud generation is reproducible.
set.seed(1)
# Name of the directory that holds the manuscript PDFs, one sub-directory
# per year.
data_path <- "all-manuscripts"
The data for the analysis is required in the form of directories with PDF files of all conference papers and poster abstracts. Due to copyright of full papers, the full paper PDFs must be manually added to the respective directory. Short papers and poster abstracts are downloaded automatically.
Add the PDFs to a directory called all-manuscripts next to this file, with one subdirectory per year:
# List the per-year manuscript directories. The location is resolved through
# here::here(data_path), exactly like every other access to the data
# directory, instead of a hard-coded path relative to the working directory.
list.files(here::here(data_path))
## [1] "2018" "2019" "2019-test"
The following downloads of AGILE short papers are not executed by default.
# Download the AGILE 2018 short papers and posters that are linked as Google
# Drive files on the conference programme page. Not executed by default.
dir_2018 <- here::here(data_path, "2018")
# recursive: also create the parent data directory on a fresh checkout;
# showWarnings = FALSE: re-running with an existing directory is not an error.
dir.create(dir_2018, recursive = TRUE, showWarnings = FALSE)

page <- read_html("https://agile-online.org/programme-2018/accepted-papers-and-posters-2018")
all_links <- page %>%
  html_nodes(css = "a") %>%
  html_attr("href")

# Anchors without an href yield NA; which() drops them together with the
# links that do not match. fixed() matches the dot literally.
drive_links <- all_links[which(str_detect(all_links, fixed("drive.google")))]
drive_ids <- lapply(drive_links, as_id)

# drive_download() writes to the working directory unless an output file path
# is given, so look up each file's name and store it in the 2018 directory,
# which is where the analysis reads the PDFs from.
for (drive_id in drive_ids) {
  drive_file <- drive_get(id = drive_id)
  drive_download(drive_file,
                 path = file.path(dir_2018, drive_file$name),
                 overwrite = TRUE)
}
# Download the AGILE 2019 short papers and posters that are linked as uploaded
# PDFs on the conference programme page. Not executed by default.
dir_2019 <- here::here(data_path, "2019")
# recursive: also create the parent data directory on a fresh checkout;
# showWarnings = FALSE: re-running with an existing directory is not an error.
dir.create(dir_2019, recursive = TRUE, showWarnings = FALSE)

page <- read_html("https://agile-online.org/conference-2019/programme-2019/accepted-papers-and-posters-2019")
all_links <- page %>%
  html_nodes(css = "a") %>%
  html_attr("href")

# Anchors without an href yield NA; which() drops them together with the
# links that do not match.
pdf_paths <- all_links[which(str_detect(all_links, fixed("Upload_your_PDF")))]
# The links are site-relative. rep() keeps the result empty when nothing
# matched; a plain paste0(host, character(0)) would return the bare host name
# and the loop below would then download the home page.
pdf_links <- paste0(rep("https://agile-online.org", length(pdf_paths)), pdf_paths)

for (link in pdf_links) {
  # mode = "wb": PDFs are binary; the default text mode corrupts them on
  # Windows.
  download.file(url = link,
                destfile = file.path(dir_2019, stringr::str_extract(link, "([^/]+$)")),
                mode = "wb")
}
# Full paths of all 2018 manuscripts. The dot is escaped so that only names
# ending in the literal extension ".pdf" match (an unescaped "." matches any
# character).
files_2018 <- dir(path = here::here(data_path, "2018"), pattern = "\\.pdf$", full.names = TRUE)
This analysis was created with the following 125 documents:
## [1] "/10.1007_2F978-3-319-78208-9_1.pdf"
## [2] "/10.1007_2F978-3-319-78208-9_10.pdf"
## [3] "/10.1007_2F978-3-319-78208-9_11.pdf"
## [4] "/10.1007_2F978-3-319-78208-9_12.pdf"
## [5] "/10.1007_2F978-3-319-78208-9_13.pdf"
## [6] "/10.1007_2F978-3-319-78208-9_14.pdf"
## [7] "/10.1007_2F978-3-319-78208-9_15.pdf"
## [8] "/10.1007_2F978-3-319-78208-9_16.pdf"
## [9] "/10.1007_2F978-3-319-78208-9_17.pdf"
## [10] "/10.1007_2F978-3-319-78208-9_18.pdf"
## [11] "/10.1007_2F978-3-319-78208-9_19.pdf"
## [12] "/10.1007_2F978-3-319-78208-9_2.pdf"
## [13] "/10.1007_2F978-3-319-78208-9_3.pdf"
## [14] "/10.1007_2F978-3-319-78208-9_4.pdf"
## [15] "/10.1007_2F978-3-319-78208-9_5.pdf"
## [16] "/10.1007_2F978-3-319-78208-9_6.pdf"
## [17] "/10.1007_2F978-3-319-78208-9_7.pdf"
## [18] "/10.1007_2F978-3-319-78208-9_8.pdf"
## [19] "/10.1007_2F978-3-319-78208-9_9.pdf"
## [20] "/101 AGILE_2018_opendata_municipality_final.pdf"
## [21] "/102 AGILE_2018_poster_revised.pdf"
## [22] "/103_paper.pdf"
## [23] "/104_paper.pdf"
## [24] "/105 105_short_paper.pdf"
## [25] "/106 A Pattern-based Framework for Designing Location-based Games.pdf"
## [26] "/107 ShortPaper ID 107.pdf"
## [27] "/108 AGILE_CameraReady2.pdf"
## [28] "/109 revised_final.pdf"
## [29] "/110 poster_110_agile_2018_distributed_generation_of_imagepyramids_revised.pdf"
## [30] "/111 AGILE_2018_paper_111.pdf"
## [31] "/112 AGILE 2018_Huang et al..pdf"
## [32] "/114HarrieKarstensHuang.pdf"
## [33] "/115 Agile_poster_115_Promoting_walking_cycling.pdf"
## [34] "/118M+ñs-ShortPaper.pdf"
## [35] "/120.pdf"
## [36] "/121 AGILE_2018_Short_Paper_ID_121_Pajarito.pdf"
## [37] "/122nDPointCloud_AGILE2018.pdf"
## [38] "/123A Platform for Coordinating Voluntary Helpers in Disaster Response.pdf"
## [39] "/124 Topological_Reconstruction_AGILE_2018 (Final).pdf"
## [40] "/125Kotavaara_at_al_AGILE_2018.pdf"
## [41] "/126 AGILE2018_SP_126.pdf"
## [42] "/127 AGILE_2018_poster 127.pdf"
## [43] "/128AGILE_2018_v1c_ShortPaper_ENACT_final.pdf"
## [44] "/129 AGILE_2018_Poster_City SImulation Lab_revised.pdf"
## [45] "/130Ledermann (2018) Towards Automatic Extraction of Cartographic Metadata from the Code of Online Maps.pdf"
## [46] "/131 Agile2018_Paper_TCH-JIN-FJO_Final_v2.pdf"
## [47] "/132 AGILE_2018_Metral_final_5.pdf"
## [48] "/134Hoda Allahbakhshi_AGILE_2018.pdf"
## [49] "/135 Short Paper ID- 135_20180425.pdf"
## [50] "/136 Poster 136-9_04_w_authors.pdf"
## [51] "/137 _AGILE_2018_v2.pdf"
## [52] "/139 Agile_HannahHaacke.pdf"
## [53] "/140AGILE2018_Yi-Min Chang Chien (revised).pdf"
## [54] "/142 Agile 2018 Alegams short_authors-final.pdf"
## [55] "/143AGILE_2018_Living_Textbook - 14 FEB 2018_after_review.pdf"
## [56] "/144 poster-144.pdf"
## [57] "/145 Short Paper ID 145-An improved European LULC map derived by data integration_correction.pdf"
## [58] "/146 Short paper ID 146.pdf"
## [59] "/147 AGILE_2018_SCBA_of_HV_OGD_final.pdf"
## [60] "/148 ShortPaper_148txt.pdf"
## [61] "/149 ShortPaper_ID_149_txt.pdf"
## [62] "/150 paper_150_AGILE_2018_V1.1.pdf"
## [63] "/151 AGILE_2018_Mai.pdf"
## [64] "/152 sstein_cedeus-research-sdi_v0.5-final.pdf"
## [65] "/153 AGILE_2018_NKayhko_revised_April10th.pdf"
## [66] "/155 Fina_AGILE_2018_Msilanga.pdf"
## [67] "/157 AGILE_2018_Short PaperTheodomirMugiranezaFinal.pdf"
## [68] "/160 JuhaszHochmair_AGILE2018_revised_final.pdf"
## [69] "/161 Citizense poster manuscript 2018 (final).pdf"
## [70] "/162 AGILE_2018_PaperAndPoster_Poster162_revised_20180413.pdf"
## [71] "/163 AGILE2018-Davidovic_Mooney_Stoimenov_camera_ready.pdf"
## [72] "/164.pdf"
## [73] "/165 Paper 165_Spatial Patterns for Crime Spots (revised).pdf"
## [74] "/166 AGILE_2018_final_djerriri.pdf"
## [75] "/167_v2.pdf"
## [76] "/168 AGILE_2018.pdf"
## [77] "/170 AGILE_18.pdf"
## [78] "/171 171-revised-final.pdf"
## [79] "/172 AGILE18 short paper submission final ZG PM.pdf"
## [80] "/173 short-paper-173.pdf"
## [81] "/41_AGILE_2018_Agung_Indrajit-rev.pdf"
## [82] "/48 paper 48.pdf"
## [83] "/49 AGILE_poster_Final.pdf"
## [84] "/51 Short_Paper_ID_51.pdf"
## [85] "/52 Spatial Vision Analysis of Non Spatial Data.pdf"
## [86] "/53 AGILE_2018_ShortPaperID53_Dane.pdf"
## [87] "/54 PosterSubmission_DingMa3.pdf"
## [88] "/55 Collaboration between Science, Practice, Citizens final.pdf"
## [89] "/57_Heinzlef-Coping with urban floods_a special decision-support system to improve resilience.pdf"
## [90] "/58 agileSEnviroPoster58.pdf"
## [91] "/59 ShortPaper59_revised.pdf"
## [92] "/62 Revised_AGILE_Poster_20180408.pdf"
## [93] "/63 Revised_Document_Westerholt.pdf"
## [94] "/64 short_paper_64.pdf"
## [95] "/65 Paper_ID_65_AGILE2018_Landuse_Characterisation.pdf"
## [96] "/66 Kernel Density Estimation (KDE) vs. Hot-Spot Analysis - Detecting Criminal Hot Spots in the City of San Francisco_UPDATE.pdf"
## [97] "/67 AGILE_2018_Villette_Purves.pdf"
## [98] "/68 68 Brox et al. AGILE_short paper brain drain after review final 3.4.18.pdf"
## [99] "/69 AGILE_2018_Geocoding_Social_Media_Messages.pdf"
## [100] "/70 AGILE_Paper_Final_Lund_2018_Ongaya_Kizito.pdf"
## [101] "/71_finalized.pdf"
## [102] "/72 short_paper_72.pdf"
## [103] "/73 AGILE_2018_shortpaper_revised_for_Libreoffice.pdf"
## [104] "/74 Training SegNet for Cropland Classification of High Resolution Remote Sensing Images.pdf"
## [105] "/75 Simulating multiple land use changes by incorporating deep belief network into cellular automata, a case study in BEIJING-TIANJIN-HEBEI region, China.pdf"
## [106] "/76 AGILE_2018_short_review.pdf"
## [107] "/77 Poster 77 - SharingBuidingInformation version - 2018-03-27.pdf"
## [108] "/80 RSanya_AGILE2018_shortPaperID80.pdf"
## [109] "/81 AGILE_2018_81 revision.pdf"
## [110] "/82 seusn_poster_agile2018-poster82.pdf"
## [111] "/83 AGILE_20180405_PerOla.pdf"
## [112] "/84_Agile_Yuanxuan_Submission_V3.pdf"
## [113] "/86 richter etal agile2018_final.pdf"
## [114] "/87 Short Paper ID.pdf"
## [115] "/89 AGILE_2018_Rivised Paper.pdf"
## [116] "/90 AGILE_2018_poster90.pdf"
## [117] "/91 POSTER 91_Spatial estimation _xiaoqian LIU.pdf"
## [118] "/93 Agile_paper_sub.pdf"
## [119] "/94 Short_Paper_ID_94_AGILE_2018_UAS_Mission_Support_Final_2018_04_10.pdf"
## [120] "/96 Poster 96.pdf"
## [121] "/98 agile_2018_short_paper_final.pdf"
## [122] "/99 TopicWave_poster99_revision_final.pdf"
## [123] "/Petter_AGILE_2018_Education_Final.pdf"
## [124] "/Petter2_AGILE_2018_Final_Paper_Uganda.pdf"
## [125] "/Yanzi_Poster paper_AGILE_0605_1.pdf"
Count the types of submissions:
# Full papers are the files whose name starts with "10.100" (see the listing
# above); everything else is a short paper or poster. The test is anchored to
# the start of the file name and the dots are escaped, so neither a directory
# in the path nor an arbitrary character in place of the dot can match.
is_full_paper_2018 <- str_detect(basename(files_2018), "^10\\.100")
full_papers_2018 <- sum(is_full_paper_2018)
other_papers_2018 <- sum(!is_full_paper_2018)
There are 19 full papers and 106 short papers/posters.
Read the data from PDFs and preprocess to create a tidy data structure without stop words:
# Additional stop words specific to this corpus (citation fragments, layout
# terms, URL schemes), tagged with their own lexicon name "agile".
my_stop_words <- tibble(
  word = c(
    "et", "al", "fig", "e.g", "i.e", "http", "ing", "pp",
    "figure", "table", "based",
    "lund", # location of conference 2018
    "https"
  ),
  lexicon = "agile"
)
# Stop word list used below: tidytext's stop_words followed by the custom ones.
all_stop_words <- bind_rows(stop_words, my_stop_words)
# Extract the text of every PDF and join the pieces returned by pdf_text()
# into one string per document. The separator must be a string: the former
# `collapse = TRUE` was coerced to the text "TRUE", which was inserted between
# the pieces and then counted as the word "true" in the analysis (and is
# rejected outright by current stringr versions).
texts <- vapply(
  files_2018,
  function(pdf_file) paste(pdf_text(pdf_file), collapse = "\n"),
  character(1),
  USE.NAMES = FALSE
)
# Document metadata, one list per PDF.
infos <- lapply(files_2018, pdf_info)
# Derive document ids from file paths: the part after the last "/" of each
# path. Vectorised over `files`.
make_id <- function(files) {
  after_last_slash <- "([^/]+$)"
  stringr::str_extract(string = files, pattern = after_last_slash)
}
# One row per document: id (file name), full path, full text and page count.
tidy_texts_2018 <- tibble(
  id = make_id(files_2018),
  file = files_2018,
  text = texts,
  # The column stays character as before, but the conversion is explicit:
  # map_chr() no longer coerces non-character values silently in current
  # purrr versions.
  pages = map_chr(infos, function(info) as.character(info$pages))
)
# Tokenise the full texts into one word per row.
papers_words <- unnest_tokens(select(tidy_texts_2018, file, text), word, text)

# Drop tokens that parse as numbers. as.numeric() warns for every
# non-numeric token, which is expected here, so the warnings are silenced.
no_numbers <- papers_words %>%
  filter(suppressWarnings(is.na(as.numeric(word))))

# Remove stop words and attach the document id.
no_stop_words_2018 <- mutate(
  anti_join(no_numbers, all_stop_words, by = "word"),
  id = make_id(file)
)

# Add the stem of every remaining word, see
# https://github.com/juliasilge/tidytext/issues/17
no_stop_stems_2018 <- mutate(no_stop_words_2018, word_stem = wordStem(word))
About 49 % of the words are considered stop words. There are 17931 unique word stems of 23292 words.
How many non-stop words does each document have?
# Number of non-stop words per document, largest documents first.
no_stop_words_2018 %>%
  count(id, name = "words") %>%
  arrange(desc(words))
| id | words |
| 10.1007_2F978-3-319-78208-9_2.pdf | 4813 |
| 10.1007_2F978-3-319-78208-9_10.pdf | 4530 |
| 10.1007_2F978-3-319-78208-9_7.pdf | 4343 |
| 10.1007_2F978-3-319-78208-9_9.pdf | 4246 |
| 10.1007_2F978-3-319-78208-9_8.pdf | 3632 |
| 10.1007_2F978-3-319-78208-9_15.pdf | 3586 |
| 10.1007_2F978-3-319-78208-9_13.pdf | 3490 |
| 10.1007_2F978-3-319-78208-9_16.pdf | 3430 |
| 10.1007_2F978-3-319-78208-9_12.pdf | 3372 |
| 10.1007_2F978-3-319-78208-9_14.pdf | 3337 |
| 10.1007_2F978-3-319-78208-9_1.pdf | 3107 |
| 10.1007_2F978-3-319-78208-9_17.pdf | 3015 |
| 10.1007_2F978-3-319-78208-9_19.pdf | 3008 |
| 10.1007_2F978-3-319-78208-9_11.pdf | 2960 |
| 10.1007_2F978-3-319-78208-9_3.pdf | 2778 |
| 10.1007_2F978-3-319-78208-9_4.pdf | 2736 |
| 10.1007_2F978-3-319-78208-9_18.pdf | 2641 |
| 10.1007_2F978-3-319-78208-9_6.pdf | 2598 |
| 10.1007_2F978-3-319-78208-9_5.pdf | 2571 |
| 121 AGILE_2018_Short_Paper_ID_121_Pajarito.pdf | 2413 |
| 152 sstein_cedeus-research-sdi_v0.5-final.pdf | 2382 |
| 41_AGILE_2018_Agung_Indrajit-rev.pdf | 2286 |
| 122nDPointCloud_AGILE2018.pdf | 2274 |
| 128AGILE_2018_v1c_ShortPaper_ENACT_final.pdf | 2218 |
| 171 171-revised-final.pdf | 2180 |
| 93 Agile_paper_sub.pdf | 2049 |
| 139 Agile_HannahHaacke.pdf | 1951 |
| 172 AGILE18 short paper submission final ZG PM.pdf | 1934 |
| 168 AGILE_2018.pdf | 1888 |
| 70 AGILE_Paper_Final_Lund_2018_Ongaya_Kizito.pdf | 1876 |
| 126 AGILE2018_SP_126.pdf | 1874 |
| 104_paper.pdf | 1873 |
| 125Kotavaara_at_al_AGILE_2018.pdf | 1867 |
| 145 Short Paper ID 145-An improved European LULC map derived by data integration_correction.pdf | 1834 |
| 134Hoda Allahbakhshi_AGILE_2018.pdf | 1817 |
| 106 A Pattern-based Framework for Designing Location-based Games.pdf | 1815 |
| 147 AGILE_2018_SCBA_of_HV_OGD_final.pdf | 1814 |
| 151 AGILE_2018_Mai.pdf | 1814 |
| 89 AGILE_2018_Rivised Paper.pdf | 1796 |
| 112 AGILE 2018_Huang et al..pdf | 1790 |
| 173 short-paper-173.pdf | 1757 |
| 170 AGILE_18.pdf | 1755 |
| 53 AGILE_2018_ShortPaperID53_Dane.pdf | 1740 |
| 65 Paper_ID_65_AGILE2018_Landuse_Characterisation.pdf | 1737 |
| 107 ShortPaper ID 107.pdf | 1706 |
| 75 Simulating multiple land use changes by incorporating deep belief network into cellular automata, a case study in BEIJING-TIANJIN-HEBEI region, China.pdf | 1706 |
| 69 AGILE_2018_Geocoding_Social_Media_Messages.pdf | 1697 |
| 109 revised_final.pdf | 1696 |
| 131 Agile2018_Paper_TCH-JIN-FJO_Final_v2.pdf | 1687 |
| 149 ShortPaper_ID_149_txt.pdf | 1670 |
| 160 JuhaszHochmair_AGILE2018_revised_final.pdf | 1670 |
| 76 AGILE_2018_short_review.pdf | 1668 |
| 130Ledermann (2018) Towards Automatic Extraction of Cartographic Metadata from the Code of Online Maps.pdf | 1663 |
| 63 Revised_Document_Westerholt.pdf | 1658 |
| 123A Platform for Coordinating Voluntary Helpers in Disaster Response.pdf | 1650 |
| 74 Training SegNet for Cropland Classification of High Resolution Remote Sensing Images.pdf | 1638 |
| 108 AGILE_CameraReady2.pdf | 1636 |
| 132 AGILE_2018_Metral_final_5.pdf | 1627 |
| 71_finalized.pdf | 1617 |
| 124 Topological_Reconstruction_AGILE_2018 (Final).pdf | 1605 |
| 59 ShortPaper59_revised.pdf | 1600 |
| 66 Kernel Density Estimation (KDE) vs. Hot-Spot Analysis - Detecting Criminal Hot Spots in the City of San Francisco_UPDATE.pdf | 1596 |
| 135 Short Paper ID- 135_20180425.pdf | 1582 |
| 165 Paper 165_Spatial Patterns for Crime Spots (revised).pdf | 1570 |
| 80 RSanya_AGILE2018_shortPaperID80.pdf | 1567 |
| 142 Agile 2018 Alegams short_authors-final.pdf | 1560 |
| 150 paper_150_AGILE_2018_V1.1.pdf | 1560 |
| 51 Short_Paper_ID_51.pdf | 1539 |
| 146 Short paper ID 146.pdf | 1531 |
| 72 short_paper_72.pdf | 1531 |
| 118M+ñs-ShortPaper.pdf | 1519 |
| 48 paper 48.pdf | 1516 |
| 163 AGILE2018-Davidovic_Mooney_Stoimenov_camera_ready.pdf | 1512 |
| 94 Short_Paper_ID_94_AGILE_2018_UAS_Mission_Support_Final_2018_04_10.pdf | 1505 |
| 64 short_paper_64.pdf | 1502 |
| 55 Collaboration between Science, Practice, Citizens final.pdf | 1462 |
| 86 richter etal agile2018_final.pdf | 1455 |
| 101 AGILE_2018_opendata_municipality_final.pdf | 1449 |
| 68 68 Brox et al. AGILE_short paper brain drain after review final 3.4.18.pdf | 1448 |
| 111 AGILE_2018_paper_111.pdf | 1436 |
| 103_paper.pdf | 1422 |
| 143AGILE_2018_Living_Textbook - 14 FEB 2018_after_review.pdf | 1408 |
| 57_Heinzlef-Coping with urban floods_a special decision-support system to improve resilience.pdf | 1404 |
| 137 _AGILE_2018_v2.pdf | 1391 |
| 87 Short Paper ID.pdf | 1368 |
| 98 agile_2018_short_paper_final.pdf | 1364 |
| Petter_AGILE_2018_Education_Final.pdf | 1362 |
| 67 AGILE_2018_Villette_Purves.pdf | 1327 |
| 114HarrieKarstensHuang.pdf | 1311 |
| 52 Spatial Vision Analysis of Non Spatial Data.pdf | 1292 |
| 140AGILE2018_Yi-Min Chang Chien (revised).pdf | 1280 |
| Petter2_AGILE_2018_Final_Paper_Uganda.pdf | 1280 |
| 84_Agile_Yuanxuan_Submission_V3.pdf | 1277 |
| 73 AGILE_2018_shortpaper_revised_for_Libreoffice.pdf | 1202 |
| 157 AGILE_2018_Short PaperTheodomirMugiranezaFinal.pdf | 1177 |
| 153 AGILE_2018_NKayhko_revised_April10th.pdf | 1159 |
| 82 seusn_poster_agile2018-poster82.pdf | 1113 |
| 166 AGILE_2018_final_djerriri.pdf | 1023 |
| 162 AGILE_2018_PaperAndPoster_Poster162_revised_20180413.pdf | 975 |
| 81 AGILE_2018_81 revision.pdf | 941 |
| 58 agileSEnviroPoster58.pdf | 931 |
| 148 ShortPaper_148txt.pdf | 921 |
| 155 Fina_AGILE_2018_Msilanga.pdf | 913 |
| 99 TopicWave_poster99_revision_final.pdf | 906 |
| 62 Revised_AGILE_Poster_20180408.pdf | 872 |
| 105 105_short_paper.pdf | 809 |
| 120.pdf | 792 |
| 136 Poster 136-9_04_w_authors.pdf | 750 |
| Yanzi_Poster paper_AGILE_0605_1.pdf | 750 |
| 115 Agile_poster_115_Promoting_walking_cycling.pdf | 716 |
| 161 Citizense poster manuscript 2018 (final).pdf | 710 |
| 54 PosterSubmission_DingMa3.pdf | 709 |
| 164.pdf | 704 |
| 127 AGILE_2018_poster 127.pdf | 702 |
| 102 AGILE_2018_poster_revised.pdf | 701 |
| 90 AGILE_2018_poster90.pdf | 672 |
| 144 poster-144.pdf | 666 |
| 91 POSTER 91_Spatial estimation _xiaoqian LIU.pdf | 621 |
| 77 Poster 77 - SharingBuidingInformation version - 2018-03-27.pdf | 611 |
| 83 AGILE_20180405_PerOla.pdf | 597 |
| 167_v2.pdf | 567 |
| 129 AGILE_2018_Poster_City SImulation Lab_revised.pdf | 561 |
| 110 poster_110_agile_2018_distributed_generation_of_imagepyramids_revised.pdf | 537 |
| 96 Poster 96.pdf | 519 |
| 49 AGILE_poster_Final.pdf | 474 |
How often do the following terms on reproducible research appear in each paper?
The detection matches full words using regex option \b.
- reproduc.* (i.e. reproducibility, reproducible, reproduce, reproduction)
- replicat.* (i.e. replication, replicate)
- repeatab.* (i.e. repeatability, repeatable)
- algorithm.* (i.e. algorithms, algorithmic)
- process.* (i.e. processing, processes, preprocessing)
- data.* (i.e. dataset(s), database(s))

tidy_texts_2018_lower <- str_to_lower(tidy_texts_2018$text)
# Count, per paper, how often each reproducibility-related term occurs in the
# lower-cased full text.
#
# Word prefixes are completed with \w* (word characters only). The former .*
# is greedy and runs to the end of the line, so a pattern such as
# "\\bdata.*\\b" matched at most once per line and therefore counted the
# lines containing a term instead of the individual word occurrences.
word_counts <- tibble(
  id = tidy_texts_2018$id,
  `reproduc..` = str_count(tidy_texts_2018_lower, "\\breproduc\\w*\\b"),
  `replic..` = str_count(tidy_texts_2018_lower, "\\breplicat\\w*\\b"),
  `repeatab..` = str_count(tidy_texts_2018_lower, "\\brepeatab\\w*\\b"),
  `code` = str_count(tidy_texts_2018_lower, "(\\bcode\\b|\\bscript\\w*\\b|\\bpseudo code\\b)"),
  `software` = str_count(tidy_texts_2018_lower, "\\bsoftware\\b"),
  `algorithm(s)` = str_count(tidy_texts_2018_lower, "\\balgorithm\\w*\\b"),
  `(pre)process..` = str_count(tidy_texts_2018_lower, "(\\bprocess\\w*\\b|\\bpreprocess\\w*\\b|\\bpre-process\\w*\\b)"),
  `data.*` = str_count(tidy_texts_2018_lower, "\\bdata\\w*\\b"),
  `result(s)` = str_count(tidy_texts_2018_lower, "\\bresults?\\b"),
  `repository/ies` = str_count(tidy_texts_2018_lower, "\\brepositor(y|ies)\\b")
) %>%
  # Row total over all term columns (everything except the leading id column).
  mutate(all = rowSums(.[-1]))
# Column totals over all papers as a single row labelled "Total". The function
# is passed directly because funs() is deprecated since dplyr 0.8.0.
word_counts_sums_total_2018 <- word_counts %>%
  summarise_if(is.numeric, sum) %>%
  # .before is a one-based column index: 1 puts the label in the first column.
  add_column(id = "Total", .before = 1)
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## please use list() instead
##
## # Before:
## funs(name = f(.))
##
## # After:
## list(name = ~ f(.))
## This warning is displayed once per session.
# Per-paper counts followed by the "Total" row.
bind_rows(word_counts, word_counts_sums_total_2018)
| id | reproduc.. | replic.. | repeatab.. | code | software | algorithm(s) | (pre)process.. | data.* | result(s) | repository/ies | all |
| 10.1007_2F978-3-319-78208-9_1.pdf | 0 | 0 | 0 | 0 | 0 | 8 | 7 | 66 | 41 | 0 | 122 |
| 10.1007_2F978-3-319-78208-9_10.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 11 | 6 | 13 | 0 | 30 |
| 10.1007_2F978-3-319-78208-9_11.pdf | 0 | 0 | 0 | 0 | 1 | 0 | 8 | 32 | 11 | 0 | 52 |
| 10.1007_2F978-3-319-78208-9_12.pdf | 0 | 0 | 0 | 0 | 0 | 42 | 11 | 48 | 5 | 0 | 106 |
| 10.1007_2F978-3-319-78208-9_13.pdf | 0 | 0 | 0 | 1 | 1 | 51 | 27 | 40 | 30 | 1 | 151 |
| 10.1007_2F978-3-319-78208-9_14.pdf | 0 | 0 | 0 | 0 | 0 | 17 | 4 | 34 | 5 | 0 | 60 |
| 10.1007_2F978-3-319-78208-9_15.pdf | 0 | 0 | 0 | 0 | 0 | 8 | 41 | 66 | 16 | 0 | 131 |
| 10.1007_2F978-3-319-78208-9_16.pdf | 0 | 0 | 0 | 0 | 0 | 2 | 8 | 27 | 3 | 0 | 40 |
| 10.1007_2F978-3-319-78208-9_17.pdf | 0 | 0 | 0 | 0 | 1 | 6 | 5 | 95 | 8 | 0 | 115 |
| 10.1007_2F978-3-319-78208-9_18.pdf | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 45 | 8 | 0 | 56 |
| 10.1007_2F978-3-319-78208-9_19.pdf | 0 | 0 | 0 | 1 | 1 | 0 | 3 | 27 | 24 | 0 | 56 |
| 10.1007_2F978-3-319-78208-9_2.pdf | 0 | 0 | 0 | 7 | 12 | 3 | 3 | 53 | 26 | 0 | 104 |
| 10.1007_2F978-3-319-78208-9_3.pdf | 0 | 0 | 0 | 0 | 2 | 0 | 2 | 28 | 10 | 0 | 42 |
| 10.1007_2F978-3-319-78208-9_4.pdf | 0 | 0 | 0 | 0 | 0 | 11 | 11 | 35 | 20 | 0 | 77 |
| 10.1007_2F978-3-319-78208-9_5.pdf | 0 | 0 | 0 | 0 | 3 | 0 | 5 | 13 | 6 | 0 | 27 |
| 10.1007_2F978-3-319-78208-9_6.pdf | 0 | 0 | 0 | 0 | 3 | 7 | 2 | 28 | 6 | 2 | 48 |
| 10.1007_2F978-3-319-78208-9_7.pdf | 0 | 0 | 0 | 1 | 46 | 1 | 3 | 77 | 12 | 4 | 144 |
| 10.1007_2F978-3-319-78208-9_8.pdf | 0 | 1 | 0 | 0 | 0 | 0 | 4 | 62 | 2 | 0 | 69 |
| 10.1007_2F978-3-319-78208-9_9.pdf | 0 | 0 | 0 | 7 | 12 | 0 | 13 | 122 | 4 | 0 | 158 |
| 101 AGILE_2018_opendata_municipality_final.pdf | 0 | 0 | 0 | 0 | 2 | 0 | 4 | 59 | 0 | 0 | 65 |
| 102 AGILE_2018_poster_revised.pdf | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 2 |
| 103_paper.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 60 | 2 | 0 | 62 |
| 104_paper.pdf | 0 | 0 | 0 | 1 | 0 | 0 | 35 | 1 | 6 | 0 | 43 |
| 105 105_short_paper.pdf | 1 | 0 | 0 | 1 | 3 | 0 | 0 | 7 | 5 | 0 | 17 |
| 106 A Pattern-based Framework for Designing Location-based Games.pdf | 0 | 0 | 0 | 3 | 1 | 0 | 4 | 7 | 0 | 0 | 15 |
| 107 ShortPaper ID 107.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 2 | 3 | 0 | 10 |
| 108 AGILE_CameraReady2.pdf | 0 | 0 | 0 | 0 | 1 | 38 | 1 | 5 | 9 | 0 | 54 |
| 109 revised_final.pdf | 0 | 0 | 0 | 0 | 0 | 1 | 14 | 6 | 5 | 0 | 26 |
| 110 poster_110_agile_2018_distributed_generation_of_imagepyramids_revised.pdf | 0 | 0 | 0 | 0 | 5 | 2 | 11 | 10 | 1 | 0 | 29 |
| 111 AGILE_2018_paper_111.pdf | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | 6 | 0 | 12 |
| 112 AGILE 2018_Huang et al..pdf | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 93 | 1 | 0 | 95 |
| 114HarrieKarstensHuang.pdf | 0 | 0 | 0 | 3 | 0 | 0 | 4 | 18 | 5 | 0 | 30 |
| 115 Agile_poster_115_Promoting_walking_cycling.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 23 | 0 | 0 | 23 |
| 118M+ñs-ShortPaper.pdf | 0 | 0 | 0 | 6 | 1 | 0 | 7 | 62 | 1 | 0 | 77 |
| 120.pdf | 1 | 0 | 0 | 0 | 0 | 3 | 1 | 30 | 7 | 0 | 42 |
| 121 AGILE_2018_Short_Paper_ID_121_Pajarito.pdf | 0 | 2 | 0 | 1 | 0 | 0 | 3 | 46 | 3 | 0 | 55 |
| 122nDPointCloud_AGILE2018.pdf | 0 | 0 | 0 | 1 | 4 | 0 | 22 | 79 | 4 | 0 | 110 |
| 123A Platform for Coordinating Voluntary Helpers in Disaster Response.pdf | 0 | 0 | 0 | 0 | 5 | 3 | 18 | 4 | 5 | 0 | 35 |
| 124 Topological_Reconstruction_AGILE_2018 (Final).pdf | 0 | 0 | 0 | 0 | 3 | 10 | 13 | 28 | 2 | 0 | 56 |
| 125Kotavaara_at_al_AGILE_2018.pdf | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 31 | 7 | 0 | 40 |
| 126 AGILE2018_SP_126.pdf | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 29 | 24 | 0 | 56 |
| 127 AGILE_2018_poster 127.pdf | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 4 | 4 | 0 | 11 |
| 128AGILE_2018_v1c_ShortPaper_ENACT_final.pdf | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 61 | 6 | 0 | 68 |
| 129 AGILE_2018_Poster_City SImulation Lab_revised.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 | 1 | 0 | 6 |
| 130Ledermann (2018) Towards Automatic Extraction of Cartographic Metadata from the Code of Online Maps.pdf | 0 | 1 | 0 | 39 | 7 | 3 | 8 | 29 | 1 | 0 | 88 |
| 131 Agile2018_Paper_TCH-JIN-FJO_Final_v2.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 25 | 7 | 3 | 0 | 35 |
| 132 AGILE_2018_Metral_final_5.pdf | 0 | 0 | 0 | 1 | 0 | 2 | 18 | 73 | 0 | 0 | 94 |
| 134Hoda Allahbakhshi_AGILE_2018.pdf | 1 | 0 | 0 | 0 | 0 | 5 | 0 | 34 | 2 | 0 | 42 |
| 135 Short Paper ID- 135_20180425.pdf | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 54 | 1 | 0 | 56 |
| 136 Poster 136-9_04_w_authors.pdf | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 7 | 3 | 0 | 13 |
| 137 _AGILE_2018_v2.pdf | 0 | 0 | 0 | 0 | 3 | 0 | 3 | 7 | 7 | 0 | 20 |
| 139 Agile_HannahHaacke.pdf | 2 | 0 | 0 | 0 | 0 | 2 | 2 | 44 | 22 | 0 | 72 |
| 140AGILE2018_Yi-Min Chang Chien (revised).pdf | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 17 | 11 | 0 | 29 |
| 142 Agile 2018 Alegams short_authors-final.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 8 | 8 | 0 | 18 |
| 143AGILE_2018_Living_Textbook - 14 FEB 2018_after_review.pdf | 0 | 0 | 0 | 0 | 1 | 0 | 3 | 2 | 5 | 0 | 11 |
| 144 poster-144.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 3 |
| 145 Short Paper ID 145-An improved European LULC map derived by data integration_correction.pdf | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 48 | 7 | 0 | 59 |
| 146 Short paper ID 146.pdf | 0 | 0 | 0 | 0 | 1 | 0 | 6 | 18 | 5 | 0 | 30 |
| 147 AGILE_2018_SCBA_of_HV_OGD_final.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 152 | 1 | 0 | 156 |
| 148 ShortPaper_148txt.pdf | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 2 | 3 | 0 | 7 |
| 149 ShortPaper_ID_149_txt.pdf | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 8 | 0 | 10 |
| 150 paper_150_AGILE_2018_V1.1.pdf | 0 | 0 | 0 | 1 | 0 | 17 | 2 | 17 | 13 | 0 | 50 |
| 151 AGILE_2018_Mai.pdf | 0 | 0 | 0 | 0 | 0 | 3 | 3 | 21 | 25 | 1 | 53 |
| 152 sstein_cedeus-research-sdi_v0.5-final.pdf | 3 | 0 | 0 | 0 | 27 | 0 | 2 | 110 | 13 | 1 | 156 |
| 153 AGILE_2018_NKayhko_revised_April10th.pdf | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 15 | 1 | 2 | 23 |
| 155 Fina_AGILE_2018_Msilanga.pdf | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 13 | 2 | 0 | 18 |
| 157 AGILE_2018_Short PaperTheodomirMugiranezaFinal.pdf | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 14 | 5 | 0 | 22 |
| 160 JuhaszHochmair_AGILE2018_revised_final.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 11 | 11 | 0 | 23 |
| 161 Citizense poster manuscript 2018 (final).pdf | 0 | 1 | 0 | 0 | 0 | 0 | 6 | 24 | 2 | 0 | 33 |
| 162 AGILE_2018_PaperAndPoster_Poster162_revised_20180413.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 8 | 8 | 0 | 17 |
| 163 AGILE2018-Davidovic_Mooney_Stoimenov_camera_ready.pdf | 0 | 1 | 0 | 2 | 1 | 0 | 11 | 30 | 4 | 0 | 49 |
| 164.pdf | 0 | 0 | 0 | 0 | 0 | 27 | 0 | 0 | 7 | 0 | 34 |
| 165 Paper 165_Spatial Patterns for Crime Spots (revised).pdf | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 10 | 11 | 0 | 24 |
| 166 AGILE_2018_final_djerriri.pdf | 0 | 0 | 0 | 0 | 2 | 18 | 7 | 24 | 3 | 0 | 54 |
| 167_v2.pdf | 0 | 0 | 0 | 0 | 3 | 0 | 2 | 9 | 0 | 0 | 14 |
| 168 AGILE_2018.pdf | 0 | 0 | 0 | 0 | 1 | 0 | 3 | 3 | 4 | 0 | 11 |
| 170 AGILE_18.pdf | 0 | 0 | 0 | 0 | 0 | 1 | 7 | 61 | 3 | 0 | 72 |
| 171 171-revised-final.pdf | 0 | 0 | 0 | 0 | 0 | 4 | 2 | 22 | 10 | 0 | 38 |
| 172 AGILE18 short paper submission final ZG PM.pdf | 0 | 4 | 0 | 0 | 0 | 0 | 4 | 52 | 20 | 0 | 80 |
| 173 short-paper-173.pdf | 10 | 1 | 0 | 3 | 1 | 1 | 2 | 16 | 2 | 0 | 36 |
| 41_AGILE_2018_Agung_Indrajit-rev.pdf | 0 | 0 | 0 | 0 | 1 | 0 | 9 | 89 | 0 | 0 | 99 |
| 48 paper 48.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 34 | 11 | 0 | 45 |
| 49 AGILE_poster_Final.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 5 | 1 | 0 | 7 |
| 51 Short_Paper_ID_51.pdf | 0 | 0 | 0 | 0 | 2 | 10 | 3 | 31 | 0 | 0 | 46 |
| 52 Spatial Vision Analysis of Non Spatial Data.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 7 | 40 | 13 | 0 | 60 |
| 53 AGILE_2018_ShortPaperID53_Dane.pdf | 0 | 0 | 0 | 0 | 2 | 0 | 3 | 32 | 8 | 0 | 45 |
| 54 PosterSubmission_DingMa3.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 12 | 0 | 0 | 13 |
| 55 Collaboration between Science, Practice, Citizens final.pdf | 0 | 0 | 0 | 0 | 1 | 1 | 5 | 17 | 4 | 0 | 28 |
| 57_Heinzlef-Coping with urban floods_a special decision-support system to improve resilience.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | 5 | 0 | 12 |
| 58 agileSEnviroPoster58.pdf | 0 | 0 | 0 | 0 | 4 | 1 | 3 | 14 | 2 | 0 | 24 |
| 59 ShortPaper59_revised.pdf | 0 | 0 | 0 | 0 | 0 | 11 | 2 | 50 | 0 | 0 | 63 |
| 62 Revised_AGILE_Poster_20180408.pdf | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 10 | 2 | 0 | 13 |
| 63 Revised_Document_Westerholt.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 10 | 18 | 15 | 0 | 43 |
| 64 short_paper_64.pdf | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 2 | 3 | 0 | 9 |
| 65 Paper_ID_65_AGILE2018_Landuse_Characterisation.pdf | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 26 | 6 | 1 | 35 |
| 66 Kernel Density Estimation (KDE) vs. Hot-Spot Analysis - Detecting Criminal Hot Spots in the City of San Francisco_UPDATE.pdf | 0 | 0 | 0 | 0 | 3 | 1 | 5 | 26 | 11 | 0 | 46 |
| 67 AGILE_2018_Villette_Purves.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 13 | 7 | 0 | 24 |
| 68 68 Brox et al. AGILE_short paper brain drain after review final 3.4.18.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 11 | 0 | 15 |
| 69 AGILE_2018_Geocoding_Social_Media_Messages.pdf | 0 | 0 | 0 | 0 | 1 | 2 | 12 | 18 | 11 | 0 | 44 |
| 70 AGILE_Paper_Final_Lund_2018_Ongaya_Kizito.pdf | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 12 | 2 | 0 | 15 |
| 71_finalized.pdf | 1 | 0 | 0 | 4 | 2 | 1 | 1 | 91 | 1 | 0 | 101 |
| 72 short_paper_72.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 18 | 7 | 0 | 29 |
| 73 AGILE_2018_shortpaper_revised_for_Libreoffice.pdf | 0 | 0 | 0 | 0 | 0 | 4 | 1 | 0 | 5 | 0 | 10 |
| 74 Training SegNet for Cropland Classification of High Resolution Remote Sensing Images.pdf | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 14 | 17 | 0 | 36 |
| 75 Simulating multiple land use changes by incorporating deep belief network into cellular automata, a case study in BEIJING-TIANJIN-HEBEI region, China.pdf | 0 | 0 | 0 | 0 | 0 | 3 | 10 | 25 | 6 | 0 | 44 |
| 76 AGILE_2018_short_review.pdf | 0 | 0 | 0 | 0 | 5 | 0 | 19 | 86 | 3 | 0 | 113 |
| 77 Poster 77 - SharingBuidingInformation version - 2018-03-27.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 9 | 27 | 2 | 0 | 38 |
| 80 RSanya_AGILE2018_shortPaperID80.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 40 | 6 | 0 | 50 |
| 81 AGILE_2018_81 revision.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | 0 | 4 |
| 82 seusn_poster_agile2018-poster82.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 0 | 4 |
| 83 AGILE_20180405_PerOla.pdf | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 3 | 1 | 0 | 6 |
| 84_Agile_Yuanxuan_Submission_V3.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 16 | 8 | 0 | 25 |
| 86 richter etal agile2018_final.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | 8 | 0 | 14 |
| 87 Short Paper ID.pdf | 0 | 0 | 0 | 0 | 2 | 0 | 12 | 12 | 12 | 0 | 38 |
| 89 AGILE_2018_Rivised Paper.pdf | 0 | 0 | 0 | 0 | 7 | 9 | 13 | 56 | 1 | 5 | 91 |
| 90 AGILE_2018_poster90.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
| 91 POSTER 91_Spatial estimation _xiaoqian LIU.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 20 | 4 | 0 | 25 |
| 93 Agile_paper_sub.pdf | 0 | 0 | 0 | 2 | 0 | 0 | 8 | 36 | 7 | 0 | 53 |
| 94 Short_Paper_ID_94_AGILE_2018_UAS_Mission_Support_Final_2018_04_10.pdf | 0 | 0 | 0 | 0 | 2 | 0 | 1 | 24 | 3 | 0 | 30 |
| 96 Poster 96.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 0 | 3 |
| 98 agile_2018_short_paper_final.pdf | 0 | 1 | 0 | 0 | 0 | 19 | 6 | 18 | 16 | 0 | 60 |
| 99 TopicWave_poster99_revision_final.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 13 | 0 | 0 | 16 |
| Petter_AGILE_2018_Education_Final.pdf | 0 | 0 | 0 | 0 | 9 | 0 | 2 | 1 | 3 | 0 | 15 |
| Petter2_AGILE_2018_Final_Paper_Uganda.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 24 | 6 | 0 | 31 |
| Yanzi_Poster paper_AGILE_0605_1.pdf | 0 | 0 | 0 | 0 | 0 | 2 | 11 | 4 | 4 | 0 | 21 |
| Total | 20 | 12 | 0 | 90 | 207 | 377 | 635 | 3536 | 827 | 17 | 5.72e+03 |
What are the most frequently used words (not stems)?
# Number of distinct papers (ids) in no_stop_words_2018 that contain each of
# the given words.
#
# the_word: character vector of words to look up.
# Returns an integer vector of the same length, named by the words; a word
# that occurs in no paper yields 0 and empty input yields an empty integer
# vector (vapply() is type-stable, unlike sapply()).
countPapersUsingWord <- function(the_word) {
  # Pull the two columns once instead of filtering the whole table per word.
  paper_ids <- no_stop_words_2018$id
  words <- no_stop_words_2018$word
  vapply(the_word, function(w) {
    # which() ignores NA comparisons, like dplyr::filter() does.
    length(unique(paper_ids[which(words == w)]))
  }, integer(1))
}
# The 20 most frequent words with their number of occurrences (n), the number
# of papers using them and their rank (place).
top_words_2018 <- no_stop_words_2018 %>%
  group_by(word) %>%
  tally() %>%
  arrange(desc(n)) %>%
  head(20) %>%
  mutate(`# papers` = countPapersUsingWord(word)) %>%
  # seq_len() stays valid for an empty table (1:0 would yield c(1, 0));
  # .before is a one-based column index, so 1 makes `place` the first column.
  add_column(place = seq_len(nrow(.)), .before = 1)
top_words_2018
| place | word | n | # papers |
| 1 | data | 3303 | 122 |
| 2 | spatial | 1393 | 110 |
| 3 | information | 1199 | 118 |
| 4 | analysis | 860 | 110 |
| 5 | model | 807 | 89 |
| 6 | time | 779 | 95 |
| 7 | map | 757 | 86 |
| 8 | study | 697 | 96 |
| 9 | results | 661 | 106 |
| 10 | urban | 552 | 72 |
| 11 | research | 540 | 111 |
| 12 | system | 505 | 75 |
| 13 | land | 503 | 51 |
| 14 | models | 487 | 73 |
| 15 | location | 482 | 76 |
| 16 | user | 468 | 57 |
| 17 | users | 463 | 62 |
| 18 | city | 440 | 59 |
| 19 | gis | 438 | 78 |
| 20 | true | 428 | 112 |
What are the top word stems?
# Number of distinct papers (ids) in no_stop_stems_2018 that contain each of
# the given word stems.
#
# the_stem: character vector of word stems to look up.
# Returns an integer vector of the same length, named by the stems; a stem
# that occurs in no paper yields 0 and empty input yields an empty integer
# vector (vapply() is type-stable, unlike sapply()).
countPapersUsingStem <- function(the_stem) {
  # Pull the two columns once instead of filtering the whole table per stem.
  paper_ids <- no_stop_stems_2018$id
  stems <- no_stop_stems_2018$word_stem
  vapply(the_stem, function(s) {
    # which() ignores NA comparisons, like dplyr::filter() does.
    length(unique(paper_ids[which(stems == s)]))
  }, integer(1))
}
# The 20 most frequent word stems with their number of occurrences (n), the
# number of papers using them and their rank (place).
top_stems_2018 <- no_stop_stems_2018 %>%
  group_by(word_stem) %>%
  tally() %>%
  arrange(desc(n)) %>%
  head(20) %>%
  mutate(`# papers` = countPapersUsingStem(word_stem)) %>%
  # seq_len() stays valid for an empty table (1:0 would yield c(1, 0));
  # .before is a one-based column index, so 1 makes `place` the first column.
  add_column(place = seq_len(nrow(.)), .before = 1)
top_stems_2018
| place | word_stem | n | # papers |
| 1 | data | 3303 | 122 |
| 2 | model | 1567 | 104 |
| 3 | spatial | 1541 | 110 |
| 4 | map | 1389 | 104 |
| 5 | inform | 1254 | 119 |
| 6 | studi | 940 | 108 |
| 7 | result | 939 | 116 |
| 8 | user | 931 | 72 |
| 9 | time | 861 | 99 |
| 10 | analysi | 860 | 110 |
| 11 | system | 850 | 97 |
| 12 | locat | 834 | 98 |
| 13 | method | 742 | 103 |
| 14 | develop | 692 | 110 |
| 15 | research | 668 | 113 |
| 16 | geograph | 640 | 98 |
| 17 | process | 635 | 99 |
| 18 | citi | 583 | 70 |
| 19 | urban | 583 | 76 |
| 20 | approach | 576 | 101 |
# Draw a two-column figure on the current graphics device: a word cloud of
# the word stems on the left and a table of the top stems on the right, each
# with a heading row above it.
#
# word_stem_data:    data frame with one row per word occurrence and a
#                    `word_stem` column.
# top_stem_data:     table of top stems, drawn as-is next to the cloud.
# year:              conference year, used in the headings.
# minimum_occurence: stems occurring fewer times than this are left out of
#                    the cloud.
# fp_count:          number of full papers, shown in the caption.
# op_count:          number of short papers/posters, shown in the caption.
#
# Called for its side effect (the plot); returns NULL invisibly.
wordStemPlot <- function(word_stem_data, top_stem_data, year, minimum_occurence, fp_count, op_count) {
  cloud_words <- word_stem_data %>%
    group_by(word_stem) %>%
    tally() %>%
    filter(n >= minimum_occurence) %>%
    arrange(desc(n))

  # Restore the caller's graphics parameters even if plotting fails half-way.
  def.par <- par(no.readonly = TRUE)
  on.exit(par(def.par), add = TRUE)

  par(mar = rep(0, 4))
  # 2 x 2 grid: a narrow heading row above the two main panels.
  layout(mat = matrix(data = c(1, 2, 3, 4), nrow = 2, ncol = 2, byrow = TRUE),
         widths = c(lcm(8), lcm(8)),
         heights = c(lcm(2), lcm(11)))

  # Panel 1: heading and caption of the word cloud.
  plot.new()
  text(0.5, 0.5, paste0("Word stem cloud of AGILE ", year, " Submissions"), font = 2)
  # NOTE(review): sum(cloud_words$n) is the number of occurrences of the
  # displayed stems, although the caption calls it "word stems" - confirm
  # which total is intended.
  text(0.5, 0.15, paste0("Based on ", fp_count, " full papers and ", op_count, " short papers/posters.\n",
                         "Showing ", nrow(cloud_words), " of ", sum(cloud_words$n),
                         " word stems occuring at least ", minimum_occurence, " times."), font = 1, cex = 0.7)

  # Panel 2: heading of the table.
  plot.new()
  text(0.5, 0.5, paste0("Top word stems of AGILE ", year, " Submissions"), font = 2)
  text(0.5, 0.15, paste0("Code available at https://github.com/nuest/\nreproducible-research-and-giscience"), font = 1, cex = 0.7)

  # Panel 3: the word cloud. `colors` is spelled out in full; the former
  # `color` only worked through partial argument matching.
  wordcloud(cloud_words$word_stem, cloud_words$n,
            max.words = Inf,
            random.order = FALSE,
            fixed.asp = FALSE,
            rot.per = 0,
            colors = brewer.pal(8, "Dark2"))

  # Panel 4: the table, a grid graphic placed into the base graphics layout,
  # thx to https://stackoverflow.com/a/25194694/261210
  frame()
  vps <- baseViewports()
  pushViewport(vps$inner, vps$figure, vps$plot)
  grid.table(as.matrix(top_stem_data),
             theme = ttheme_minimal(base_size = 11,
                                    padding = unit(c(10, 5), "pt")))
  popViewport(3)

  invisible(NULL)
}
# The minimum occurrence (200) was tuned by hand so that all remaining word stems could be plotted.
wordStemPlot(no_stop_stems_2018, top_stems_2018, "2018", 200, full_papers_2018, other_papers_2018)
# List the PDF manuscripts in the 2019 directory with their full paths. The dot
# is escaped so that only a literal ".pdf" extension matches.
files_2019 <- dir(path = here::here(data_path, "2019"), pattern = "\\.pdf$", full.names = TRUE)
This analysis was created with the following 93 documents:
## [1] "/10.1007@978-3-030-14745-71.pdf"
## [2] "/10.1007@978-3-030-14745-710_ocr.pdf"
## [3] "/10.1007@978-3-030-14745-711.pdf"
## [4] "/10.1007@978-3-030-14745-712.pdf"
## [5] "/10.1007@978-3-030-14745-713.pdf"
## [6] "/10.1007@978-3-030-14745-714.pdf"
## [7] "/10.1007@978-3-030-14745-715.pdf"
## [8] "/10.1007@978-3-030-14745-716.pdf"
## [9] "/10.1007@978-3-030-14745-717.pdf"
## [10] "/10.1007@978-3-030-14745-718.pdf"
## [11] "/10.1007@978-3-030-14745-719.pdf"
## [12] "/10.1007@978-3-030-14745-72.pdf"
## [13] "/10.1007@978-3-030-14745-73.pdf"
## [14] "/10.1007@978-3-030-14745-74.pdf"
## [15] "/10.1007@978-3-030-14745-75.pdf"
## [16] "/10.1007@978-3-030-14745-76.pdf"
## [17] "/10.1007@978-3-030-14745-77.pdf"
## [18] "/10.1007@978-3-030-14745-78.pdf"
## [19] "/10.1007@978-3-030-14745-79.pdf"
## [20] "/100_Upload_your_PDF_file.pdf"
## [21] "/101_Upload_your_PDF_file.pdf"
## [22] "/102_Upload_your_PDF_file.pdf"
## [23] "/105_Upload_your_PDF_file.pdf"
## [24] "/106_Upload_your_PDF_file.pdf"
## [25] "/107_Upload_your_PDF_file.pdf"
## [26] "/108_Upload_your_PDF_file.pdf"
## [27] "/109_Upload_your_PDF_file.pdf"
## [28] "/110_Upload_your_PDF_file.pdf"
## [29] "/112_Upload_your_PDF_file.pdf"
## [30] "/113_Upload_your_PDF_file.pdf"
## [31] "/114_Upload_your_PDF_file.pdf"
## [32] "/116_Upload_your_PDF_file.pdf"
## [33] "/118_Upload_your_PDF_file.pdf"
## [34] "/123_Upload_your_PDF_file.pdf"
## [35] "/127_Upload_your_PDF_file.pdf"
## [36] "/131_Upload_your_PDF_file.pdf"
## [37] "/132_Upload_your_PDF_file.pdf"
## [38] "/34_Upload_your_PDF_file.pdf"
## [39] "/35_Upload_your_PDF_file.pdf"
## [40] "/38_Upload_your_PDF_file.pdf"
## [41] "/39_Upload_your_PDF_file.pdf"
## [42] "/40_Upload_your_PDF_file.pdf"
## [43] "/43_Upload_your_PDF_file.pdf"
## [44] "/44_Upload_your_PDF_file.pdf"
## [45] "/45_Upload_your_PDF_file.pdf"
## [46] "/46_Upload_your_PDF_file.pdf"
## [47] "/47_Upload_your_PDF_file.pdf"
## [48] "/50_Upload_your_PDF_file.pdf"
## [49] "/51_Upload_your_PDF_file.pdf"
## [50] "/52_Upload_your_PDF_file.pdf"
## [51] "/53_Upload_your_PDF_file.pdf"
## [52] "/54_Upload_your_PDF_file.pdf"
## [53] "/56_Upload_your_PDF_file.pdf"
## [54] "/58_Upload_your_PDF_file.pdf"
## [55] "/59_Upload_your_PDF_file.pdf"
## [56] "/6_Upload_your_PDF_file.pdf"
## [57] "/60_Upload_your_PDF_file.pdf"
## [58] "/61_Upload_your_PDF_file.pdf"
## [59] "/62_Upload_your_PDF_file.pdf"
## [60] "/63_Upload_your_PDF_file.pdf"
## [61] "/64_Upload_your_PDF_file.pdf"
## [62] "/65_Upload_your_PDF_file.pdf"
## [63] "/66_Upload_your_PDF_file.pdf"
## [64] "/67_Upload_your_PDF_file.pdf"
## [65] "/68_Upload_your_PDF_file.pdf"
## [66] "/69_Upload_your_PDF_file.pdf"
## [67] "/71_Upload_your_PDF_file.pdf"
## [68] "/72_Upload_your_PDF_file.pdf"
## [69] "/73_Upload_your_PDF_file.pdf"
## [70] "/74_Upload_your_PDF_file.pdf"
## [71] "/75_Upload_your_PDF_file.pdf"
## [72] "/76_Upload_your_PDF_file.pdf"
## [73] "/77_Upload_your_PDF_file.pdf"
## [74] "/78_Upload_your_PDF_file.pdf"
## [75] "/79_Upload_your_PDF_file.pdf"
## [76] "/80_Upload_your_PDF_file.pdf"
## [77] "/81_Upload_your_PDF_file.pdf"
## [78] "/82_Upload_your_PDF_file.pdf"
## [79] "/83_Upload_your_PDF_file.pdf"
## [80] "/84_Upload_your_PDF_file.pdf"
## [81] "/85_Upload_your_PDF_file.pdf"
## [82] "/86_Upload_your_PDF_file.pdf"
## [83] "/87_Upload_your_PDF_file.pdf"
## [84] "/88_Upload_your_PDF_file.pdf"
## [85] "/89_Upload_your_PDF_file.pdf"
## [86] "/90_Upload_your_PDF_file.pdf"
## [87] "/93_Upload_your_PDF_file.pdf"
## [88] "/94_Upload_your_PDF_file.pdf"
## [89] "/95_Upload_your_PDF_file.pdf"
## [90] "/96_Upload_your_PDF_file.pdf"
## [91] "/97_Upload_your_PDF_file.pdf"
## [92] "/98_Upload_your_PDF_file.pdf"
## [93] "/99_Upload_your_PDF_file.pdf"
Count the types of submissions:
# A file is counted as a full paper when its file name contains the literal
# text "10.100" (as in the `10.1007@...` names); every other PDF is counted as
# a short paper/poster. Matching is done on the file name only and with a fixed
# string, so neither the directory part of the path nor the regex meaning of
# "." can cause a false match.
is_full_paper_2019 <- str_detect(basename(files_2019), fixed("10.100"))
full_papers_2019 <- sum(is_full_paper_2019)
other_papers_2019 <- sum(!is_full_paper_2019)
Read the data from PDFs and preprocess to create a tidy data structure without stop words:
# Extract the text of every PDF (pdf_text() returns one string per page).
texts <- lapply(files_2019, pdf_text)
# Join the pages of each document into one string. `collapse` must be the
# separator itself: the former `collapse = TRUE` is not a valid separator (it is
# either coerced to the text "TRUE", which then ends up in the corpus as the
# token "true", or rejected, depending on the stringr version).
# NOTE(review): use the same separator wherever the texts of other years are
# collapsed so that the corpora stay comparable.
texts <- vapply(texts, paste, character(1), collapse = "\n")
infos <- lapply(files_2019, pdf_info)

# One row per document. `pages` is kept as a character column; the conversion
# is explicit instead of relying on map_chr() coercing integers.
tidy_texts_2019 <- tibble(id = make_id(files_2019),
                          file = files_2019,
                          text = texts,
                          pages = map_chr(infos, function(info) as.character(info$pages)))

# One row per word.
papers_words <- tidy_texts_2019 %>%
  select(file, text) %>%
  unnest_tokens(word, text)

# Drop tokens that parse as numbers. Only the coercion warnings raised by
# as.numeric() are silenced, not warnings from the rest of the pipeline.
no_numbers <- papers_words %>%
  filter(is.na(suppressWarnings(as.numeric(word))))

# Remove stop words and re-attach the document id derived from the file path.
no_stop_words_2019 <- no_numbers %>%
  anti_join(all_stop_words, by = "word") %>%
  mutate(id = make_id(file))

# Add the stem of every remaining word.
# https://github.com/juliasilge/tidytext/issues/17
no_stop_stems_2019 <- no_stop_words_2019 %>%
  mutate(word_stem = wordStem(word))
About 49 % of the words are considered stop words. There are 15276 unique word stems of 19854 words.
Note: In the original paper corpus, one paper, 10.1007@978-3-030-14745-710.pdf, could not be read properly: only 1 word was extracted from it. Since it was not possible to copy or extract text, the file was sent through an OCR process (using OCRmyPDF), and the original file was renamed to 10.1007@978-3-030-14745-710pdf.orig:
# Run OCRmyPDF in a Docker container with the 2019 manuscript directory mounted, forcing OCR and writing the result to a new *_ocr.pdf file.
docker run -v $(pwd)/all-manuscripts/2019:/home/docker -it jbarlow83/ocrmypdf --force-ocr 10.1007@978-3-030-14745-710.pdf 10.1007@978-3-030-14745-710_ocr.pdf
# Rename the original file so that its name no longer ends in ".pdf".
mv all-manuscripts/2019/10.1007@978-3-030-14745-710.pdf all-manuscripts/2019/10.1007@978-3-030-14745-710pdf.orig
How many non-stop words does each document have?
# Number of non-stop words per document, largest document first.
no_stop_words_2019 %>%
  count(id, name = "words", sort = TRUE)
| id | words |
| 10.1007@978-3-030-14745-718.pdf | 4583 |
| 10.1007@978-3-030-14745-72.pdf | 4188 |
| 10.1007@978-3-030-14745-715.pdf | 4115 |
| 10.1007@978-3-030-14745-75.pdf | 3721 |
| 10.1007@978-3-030-14745-714.pdf | 3715 |
| 10.1007@978-3-030-14745-716.pdf | 3383 |
| 10.1007@978-3-030-14745-77.pdf | 3328 |
| 10.1007@978-3-030-14745-79.pdf | 3268 |
| 10.1007@978-3-030-14745-74.pdf | 3196 |
| 10.1007@978-3-030-14745-713.pdf | 3168 |
| 10.1007@978-3-030-14745-78.pdf | 3145 |
| 10.1007@978-3-030-14745-710_ocr.pdf | 3065 |
| 10.1007@978-3-030-14745-71.pdf | 3050 |
| 10.1007@978-3-030-14745-719.pdf | 3008 |
| 10.1007@978-3-030-14745-73.pdf | 3002 |
| 10.1007@978-3-030-14745-717.pdf | 2935 |
| 10.1007@978-3-030-14745-712.pdf | 2923 |
| 71_Upload_your_PDF_file.pdf | 2597 |
| 10.1007@978-3-030-14745-711.pdf | 2360 |
| 62_Upload_your_PDF_file.pdf | 2064 |
| 38_Upload_your_PDF_file.pdf | 2050 |
| 10.1007@978-3-030-14745-76.pdf | 2036 |
| 105_Upload_your_PDF_file.pdf | 2030 |
| 51_Upload_your_PDF_file.pdf | 1916 |
| 84_Upload_your_PDF_file.pdf | 1852 |
| 63_Upload_your_PDF_file.pdf | 1841 |
| 72_Upload_your_PDF_file.pdf | 1832 |
| 68_Upload_your_PDF_file.pdf | 1823 |
| 93_Upload_your_PDF_file.pdf | 1807 |
| 53_Upload_your_PDF_file.pdf | 1806 |
| 85_Upload_your_PDF_file.pdf | 1801 |
| 40_Upload_your_PDF_file.pdf | 1797 |
| 107_Upload_your_PDF_file.pdf | 1758 |
| 96_Upload_your_PDF_file.pdf | 1748 |
| 61_Upload_your_PDF_file.pdf | 1730 |
| 52_Upload_your_PDF_file.pdf | 1697 |
| 98_Upload_your_PDF_file.pdf | 1686 |
| 65_Upload_your_PDF_file.pdf | 1664 |
| 87_Upload_your_PDF_file.pdf | 1647 |
| 34_Upload_your_PDF_file.pdf | 1643 |
| 95_Upload_your_PDF_file.pdf | 1619 |
| 45_Upload_your_PDF_file.pdf | 1617 |
| 64_Upload_your_PDF_file.pdf | 1561 |
| 100_Upload_your_PDF_file.pdf | 1559 |
| 102_Upload_your_PDF_file.pdf | 1553 |
| 35_Upload_your_PDF_file.pdf | 1538 |
| 86_Upload_your_PDF_file.pdf | 1530 |
| 54_Upload_your_PDF_file.pdf | 1523 |
| 99_Upload_your_PDF_file.pdf | 1521 |
| 81_Upload_your_PDF_file.pdf | 1514 |
| 6_Upload_your_PDF_file.pdf | 1512 |
| 82_Upload_your_PDF_file.pdf | 1512 |
| 108_Upload_your_PDF_file.pdf | 1511 |
| 59_Upload_your_PDF_file.pdf | 1499 |
| 90_Upload_your_PDF_file.pdf | 1493 |
| 127_Upload_your_PDF_file.pdf | 1490 |
| 79_Upload_your_PDF_file.pdf | 1484 |
| 39_Upload_your_PDF_file.pdf | 1459 |
| 67_Upload_your_PDF_file.pdf | 1458 |
| 75_Upload_your_PDF_file.pdf | 1446 |
| 123_Upload_your_PDF_file.pdf | 1440 |
| 74_Upload_your_PDF_file.pdf | 1440 |
| 118_Upload_your_PDF_file.pdf | 1432 |
| 80_Upload_your_PDF_file.pdf | 1368 |
| 76_Upload_your_PDF_file.pdf | 1349 |
| 94_Upload_your_PDF_file.pdf | 1331 |
| 110_Upload_your_PDF_file.pdf | 1290 |
| 58_Upload_your_PDF_file.pdf | 1253 |
| 109_Upload_your_PDF_file.pdf | 1251 |
| 77_Upload_your_PDF_file.pdf | 1203 |
| 101_Upload_your_PDF_file.pdf | 1198 |
| 78_Upload_your_PDF_file.pdf | 1186 |
| 44_Upload_your_PDF_file.pdf | 1146 |
| 56_Upload_your_PDF_file.pdf | 1116 |
| 50_Upload_your_PDF_file.pdf | 1076 |
| 116_Upload_your_PDF_file.pdf | 1047 |
| 114_Upload_your_PDF_file.pdf | 996 |
| 46_Upload_your_PDF_file.pdf | 987 |
| 132_Upload_your_PDF_file.pdf | 933 |
| 83_Upload_your_PDF_file.pdf | 897 |
| 89_Upload_your_PDF_file.pdf | 804 |
| 69_Upload_your_PDF_file.pdf | 796 |
| 131_Upload_your_PDF_file.pdf | 777 |
| 113_Upload_your_PDF_file.pdf | 747 |
| 66_Upload_your_PDF_file.pdf | 716 |
| 47_Upload_your_PDF_file.pdf | 675 |
| 73_Upload_your_PDF_file.pdf | 662 |
| 106_Upload_your_PDF_file.pdf | 661 |
| 88_Upload_your_PDF_file.pdf | 650 |
| 43_Upload_your_PDF_file.pdf | 638 |
| 60_Upload_your_PDF_file.pdf | 634 |
| 97_Upload_your_PDF_file.pdf | 528 |
| 112_Upload_your_PDF_file.pdf | 458 |
How often do the following terms on reproducible research appear in each paper?
The detection matches full words using regex option \b.
- reproduc.* (i.e. reproducibility, reproducible, reproduce, reproduction)
- replicat.* (i.e. replication, replicate)
- repeatab.* (i.e. repeatability, repeatable)
- algorithm.* (i.e. algorithms, algorithmic)
- process.* (i.e. processing, processes, preprocessing)
- data.* (i.e. dataset(s), database(s))

tidy_texts_2019_lower <- str_to_lower(tidy_texts_2019$text)
# Count, per paper, the matches of each reproducibility-related pattern in the
# lower-cased full text and add the row sum as column `all`.
#
# NOTE(review): in the patterns ending in `.*\\b` the greedy `.*` runs on to
# the last word boundary of the text line, so several occurrences on the same
# line are counted as one match. `\\w*` would count every occurrence; the
# patterns are left untouched here so that the counts stay comparable with
# tables computed with the same patterns - confirm before changing.
word_counts <- tibble(
  id = tidy_texts_2019$id,
  `reproduc..` = str_count(tidy_texts_2019_lower, "\\breproduc.*\\b"),
  `replic..` = str_count(tidy_texts_2019_lower, "\\breplicat.*\\b"),
  `repeatab..` = str_count(tidy_texts_2019_lower, "\\brepeatab.*\\b"),
  # "\ " is not a valid escape in an R string; a plain space matches the same text.
  `code` = str_count(tidy_texts_2019_lower, "(\\bcode\\b|\\bscript.*\\b|\\bpseudo code\\b)"),
  `software` = str_count(tidy_texts_2019_lower, "\\bsoftware\\b"),
  `algorithm(s)` = str_count(tidy_texts_2019_lower, "\\balgorithm.*\\b"),
  `(pre)process..` = str_count(tidy_texts_2019_lower, "(\\bprocess.*\\b|\\bpreprocess.*\\b|\\bpre-process.*\\b)"),
  `data.*` = str_count(tidy_texts_2019_lower, "\\bdata.*\\b"),
  `result(s)` = str_count(tidy_texts_2019_lower, "\\bresults?\\b"),
  `repository/ies` = str_count(tidy_texts_2019_lower, "\\brepositor(y|ies)\\b")
) %>%
  mutate(all = rowSums(.[-1]))

# Column totals over all papers; the function is passed directly because
# funs() is deprecated.
word_counts_sums_total_2019 <- word_counts %>%
  summarise_if(is.numeric, sum) %>%
  add_column(id = "Total", .before = 0)

# Per-paper counts followed by the "Total" row.
rbind(word_counts, word_counts_sums_total_2019)
| id | reproduc.. | replic.. | repeatab.. | code | software | algorithm(s) | (pre)process.. | data.* | result(s) | repository/ies | all |
| 10.1007@978-3-030-14745-71.pdf | 0 | 1 | 0 | 2 | 0 | 0 | 5 | 29 | 14 | 0 | 51 |
| 10.1007@978-3-030-14745-710_ocr.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 65 | 32 | 0 | 98 |
| 10.1007@978-3-030-14745-711.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 67 | 8 | 0 | 77 |
| 10.1007@978-3-030-14745-712.pdf | 0 | 0 | 0 | 0 | 0 | 2 | 6 | 95 | 21 | 0 | 124 |
| 10.1007@978-3-030-14745-713.pdf | 0 | 0 | 0 | 0 | 0 | 10 | 19 | 63 | 12 | 0 | 104 |
| 10.1007@978-3-030-14745-714.pdf | 3 | 0 | 0 | 0 | 12 | 2 | 15 | 39 | 16 | 0 | 87 |
| 10.1007@978-3-030-14745-715.pdf | 0 | 0 | 0 | 4 | 1 | 0 | 4 | 42 | 43 | 0 | 94 |
| 10.1007@978-3-030-14745-716.pdf | 0 | 6 | 0 | 0 | 2 | 0 | 17 | 2 | 7 | 0 | 34 |
| 10.1007@978-3-030-14745-717.pdf | 0 | 0 | 0 | 0 | 0 | 2 | 45 | 12 | 24 | 0 | 83 |
| 10.1007@978-3-030-14745-718.pdf | 0 | 0 | 1 | 0 | 6 | 0 | 11 | 9 | 6 | 0 | 33 |
| 10.1007@978-3-030-14745-719.pdf | 0 | 0 | 0 | 0 | 2 | 1 | 9 | 47 | 5 | 0 | 64 |
| 10.1007@978-3-030-14745-72.pdf | 0 | 0 | 0 | 0 | 0 | 5 | 24 | 29 | 19 | 1 | 78 |
| 10.1007@978-3-030-14745-73.pdf | 0 | 0 | 0 | 1 | 1 | 17 | 13 | 36 | 20 | 0 | 88 |
| 10.1007@978-3-030-14745-74.pdf | 0 | 0 | 0 | 1 | 2 | 8 | 43 | 150 | 0 | 0 | 204 |
| 10.1007@978-3-030-14745-75.pdf | 1 | 0 | 0 | 0 | 0 | 8 | 10 | 53 | 9 | 0 | 81 |
| 10.1007@978-3-030-14745-76.pdf | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 55 | 9 | 0 | 66 |
| 10.1007@978-3-030-14745-77.pdf | 0 | 0 | 0 | 0 | 2 | 14 | 3 | 70 | 12 | 0 | 101 |
| 10.1007@978-3-030-14745-78.pdf | 0 | 0 | 0 | 1 | 1 | 42 | 27 | 3 | 9 | 0 | 83 |
| 10.1007@978-3-030-14745-79.pdf | 0 | 0 | 0 | 1 | 0 | 5 | 3 | 76 | 6 | 0 | 91 |
| 100_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 26 | 2 | 45 | 16 | 0 | 89 |
| 101_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 15 | 7 | 0 | 25 |
| 102_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 1 | 4 | 5 | 7 | 37 | 5 | 0 | 59 |
| 105_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 3 | 4 | 30 | 25 | 0 | 62 |
| 106_Upload_your_PDF_file.pdf | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 32 | 4 | 0 | 38 |
| 107_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 43 | 10 | 0 | 59 |
| 108_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 4 | 6 | 13 | 9 | 0 | 32 |
| 109_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 2 | 1 | 6 | 46 | 0 | 0 | 55 |
| 110_Upload_your_PDF_file.pdf | 1 | 0 | 0 | 1 | 0 | 0 | 14 | 60 | 7 | 0 | 83 |
| 112_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 3 | 1 | 7 | 5 | 0 | 16 |
| 113_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 1 | 2 | 0 | 10 | 14 | 4 | 0 | 31 |
| 114_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 1 | 9 | 2 | 4 | 4 | 0 | 20 |
| 116_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 1 | 0 | 3 | 8 | 3 | 0 | 15 |
| 118_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 7 | 6 | 11 | 6 | 0 | 30 |
| 123_Upload_your_PDF_file.pdf | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 55 | 4 | 0 | 61 |
| 127_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 1 | 22 | 8 | 7 | 9 | 0 | 47 |
| 131_Upload_your_PDF_file.pdf | 0 | 0 | 2 | 1 | 1 | 3 | 2 | 1 | 1 | 0 | 11 |
| 132_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 11 | 2 | 0 | 14 |
| 34_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 4 | 0 | 5 | 2 | 7 | 5 | 0 | 23 |
| 35_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 4 | 6 | 13 | 2 | 0 | 25 |
| 38_Upload_your_PDF_file.pdf | 36 | 1 | 0 | 13 | 16 | 0 | 10 | 42 | 16 | 4 | 138 |
| 39_Upload_your_PDF_file.pdf | 1 | 0 | 0 | 1 | 0 | 0 | 8 | 6 | 2 | 0 | 18 |
| 40_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 2 | 3 | 6 | 11 | 2 | 0 | 24 |
| 43_Upload_your_PDF_file.pdf | 37 | 0 | 0 | 2 | 3 | 0 | 1 | 7 | 5 | 1 | 56 |
| 44_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 1 | 10 | 1 | 2 | 5 | 0 | 19 |
| 45_Upload_your_PDF_file.pdf | 4 | 0 | 0 | 7 | 3 | 1 | 10 | 70 | 4 | 2 | 101 |
| 46_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 23 | 4 | 0 | 28 |
| 47_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 3 | 9 | 0 | 17 |
| 50_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 4 | 2 | 6 | 23 | 9 | 0 | 44 |
| 51_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 1 | 5 | 5 | 1 | 15 | 14 | 0 | 41 |
| 52_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 2 | 1 | 3 | 10 | 7 | 0 | 23 |
| 53_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 3 | 2 | 26 | 16 | 0 | 47 |
| 54_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 2 | 14 | 24 | 4 | 0 | 44 |
| 56_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 7 | 2 | 3 | 0 | 12 |
| 58_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 3 | 0 | 2 | 0 | 4 | 3 | 0 | 12 |
| 59_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 27 | 4 | 0 | 34 |
| 6_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 3 | 41 | 35 | 8 | 0 | 87 |
| 60_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 2 | 2 | 5 | 23 | 2 | 1 | 35 |
| 61_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 1 | 7 | 44 | 6 | 0 | 58 |
| 62_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 1 | 10 | 74 | 11 | 0 | 96 |
| 63_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 9 | 0 | 41 | 3 | 0 | 53 |
| 64_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 14 | 1 | 14 | 14 | 0 | 43 |
| 65_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 2 | 32 | 14 | 56 | 13 | 0 | 117 |
| 66_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | 0 | 0 | 12 |
| 67_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 6 | 0 | 1 | 8 | 8 | 0 | 23 |
| 68_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 5 | 3 | 2 | 45 | 87 | 3 | 0 | 145 |
| 69_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13 | 3 | 0 | 16 |
| 71_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 1 | 2 | 0 | 33 | 8 | 0 | 44 |
| 72_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 2 | 5 | 101 | 6 | 0 | 114 |
| 73_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 2 | 0 | 5 |
| 74_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 15 | 10 | 65 | 24 | 0 | 114 |
| 75_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 1 | 3 | 0 | 7 |
| 76_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 93 | 5 | 0 | 99 |
| 77_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 2 | 1 | 1 | 10 | 21 | 3 | 1 | 39 |
| 78_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 3 | 12 | 7 | 32 | 47 | 0 | 0 | 101 |
| 79_Upload_your_PDF_file.pdf | 0 | 0 | 1 | 0 | 2 | 0 | 14 | 4 | 6 | 0 | 27 |
| 80_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 46 | 17 | 0 | 67 |
| 81_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 15 | 3 | 0 | 22 |
| 82_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 4 | 4 | 50 | 5 | 0 | 63 |
| 83_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 1 | 1 | 2 | 6 | 2 | 0 | 12 |
| 84_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 1 | 3 | 4 | 3 | 16 | 17 | 0 | 44 |
| 85_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 14 | 3 | 0 | 19 |
| 86_Upload_your_PDF_file.pdf | 36 | 19 | 0 | 0 | 0 | 0 | 6 | 7 | 9 | 0 | 77 |
| 87_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 1 | 0 | 26 | 37 | 10 | 0 | 74 |
| 88_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 7 | 0 | 0 | 11 |
| 89_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 5 | 0 | 2 | 24 | 0 | 0 | 31 |
| 90_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 3 | 5 | 19 | 8 | 0 | 35 |
| 93_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 25 | 11 | 0 | 42 |
| 94_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 1 | 20 | 3 | 10 | 19 | 0 | 53 |
| 95_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 8 | 7 | 0 | 19 |
| 96_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 4 | 0 | 10 | 83 | 0 | 0 | 97 |
| 97_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 6 | 0 | 10 | 1 | 0 | 17 |
| 98_Upload_your_PDF_file.pdf | 0 | 0 | 0 | 0 | 0 | 3 | 4 | 96 | 12 | 0 | 115 |
| 99_Upload_your_PDF_file.pdf | 1 | 0 | 0 | 1 | 0 | 0 | 14 | 38 | 2 | 0 | 56 |
| Total | 121 | 28 | 4 | 59 | 123 | 378 | 718 | 2981 | 761 | 10 | 5.18e+03 |
What are top used words (not stems)?
# Count, for every word in `the_word`, the number of distinct papers (by `id`)
# in `no_stop_words_2019` whose `word` column contains that word.
#
# the_word: vector of words.
# Returns an integer vector with one count per word (named by the words when
# `the_word` is a character vector).
countPapersUsingWord <- function(the_word) {
  # vapply() pins the result to exactly one integer per word, so an empty input
  # yields integer(0) instead of the empty list that sapply() would return.
  vapply(the_word, function(w) {
    no_stop_words_2019 %>%
      filter(word == w) %>%
      distinct(id) %>%
      nrow()
  }, FUN.VALUE = integer(1))
}
# Rank the 20 most frequent words of 2019, record in how many papers each one
# occurs, and print the resulting table.
top_words_2019 <- no_stop_words_2019 %>%
  group_by(word) %>%
  tally() %>%
  arrange(desc(n)) %>%
  head(20) %>%
  mutate(`# papers` = countPapersUsingWord(word)) %>%
  # seq_len() stays empty for an empty table, where 1:nrow(.) would be c(1, 0).
  add_column(place = seq_len(nrow(.)), .before = 0)
top_words_2019
| place | word | n | # papers |
| 1 | data | 2723 | 92 |
| 2 | spatial | 1052 | 87 |
| 3 | time | 714 | 77 |
| 4 | information | 712 | 90 |
| 5 | analysis | 642 | 81 |
| 6 | model | 626 | 58 |
| 7 | results | 603 | 83 |
| 8 | study | 553 | 75 |
| 9 | research | 479 | 83 |
| 10 | urban | 474 | 60 |
| 11 | land | 411 | 41 |
| 12 | distance | 402 | 48 |
| 13 | map | 399 | 60 |
| 14 | approach | 375 | 71 |
| 15 | method | 363 | 67 |
| 16 | network | 347 | 49 |
| 17 | process | 345 | 68 |
| 18 | location | 333 | 65 |
| 19 | set | 316 | 72 |
| 20 | agile | 309 | 80 |
What are the top word stems?
# Count, for every stem in `the_stem`, the number of distinct papers (by `id`)
# in `no_stop_stems_2019` whose `word_stem` column contains that stem.
#
# the_stem: vector of word stems.
# Returns an integer vector with one count per stem (named by the stems when
# `the_stem` is a character vector).
countPapersUsingStem <- function(the_stem) {
  # vapply() pins the result to exactly one integer per stem, so an empty input
  # yields integer(0) instead of the empty list that sapply() would return.
  vapply(the_stem, function(s) {
    no_stop_stems_2019 %>%
      filter(word_stem == s) %>%
      distinct(id) %>%
      nrow()
  }, FUN.VALUE = integer(1))
}
# Rank the 20 most frequent word stems of 2019, record in how many papers each
# one occurs, and print the resulting table.
top_stems_2019 <- no_stop_stems_2019 %>%
  group_by(word_stem) %>%
  tally() %>%
  arrange(desc(n)) %>%
  head(20) %>%
  mutate(`# papers` = countPapersUsingStem(word_stem)) %>%
  # seq_len() stays empty for an empty table, where 1:nrow(.) would be c(1, 0).
  add_column(place = seq_len(nrow(.)), .before = 0)
top_stems_2019
| place | word_stem | n | # papers |
| 1 | data | 2723 | 92 |
| 2 | spatial | 1120 | 87 |
| 3 | model | 1063 | 74 |
| 4 | result | 856 | 88 |
| 5 | time | 821 | 79 |
| 6 | inform | 742 | 90 |
| 7 | map | 735 | 72 |
| 8 | studi | 730 | 79 |
| 9 | process | 715 | 83 |
| 10 | method | 665 | 81 |
| 11 | analysi | 642 | 81 |
| 12 | locat | 598 | 79 |
| 13 | user | 541 | 52 |
| 14 | urban | 540 | 60 |
| 15 | approach | 531 | 78 |
| 16 | develop | 531 | 80 |
| 17 | research | 520 | 84 |
| 18 | system | 517 | 80 |
| 19 | network | 516 | 58 |
| 20 | geograph | 508 | 72 |
# The minimum occurrence (145) was tuned by hand so that all remaining word stems could be plotted.
wordStemPlot(no_stop_stems_2019, top_stems_2019, "2019", 145, full_papers_2019, other_papers_2019)
# Turn the per-year "Total" rows into keyword summaries: the first column is
# renamed to `year` and filled with the conference year.
keywords_2018 <- word_counts_sums_total_2018
names(keywords_2018)[1] <- "year"
keywords_2018$year <- "2018"

keywords_2019 <- word_counts_sums_total_2019
names(keywords_2019)[1] <- "year"
keywords_2019$year <- "2019"

# Absolute keyword counts, one row per year.
rbind(keywords_2018, keywords_2019)
| year | reproduc.. | replic.. | repeatab.. | code | software | algorithm(s) | (pre)process.. | data.* | result(s) | repository/ies | all |
| 2018 | 20 | 12 | 0 | 90 | 207 | 377 | 635 | 3536 | 827 | 17 | 5.72e+03 |
| 2019 | 121 | 28 | 4 | 59 | 123 | 378 | 718 | 2981 | 761 | 10 | 5.18e+03 |
# Keyword counts per paper: each year's totals (without the `year` column) are
# divided by that year's number of documents and rounded to two digits.
n_papers_2018 <- full_papers_2018 + other_papers_2018
n_papers_2019 <- full_papers_2019 + other_papers_2019
keywords_per_paper <- rbind(
  dplyr::bind_cols(keywords_2018[-1] / n_papers_2018),
  dplyr::bind_cols(keywords_2019[-1] / n_papers_2019)
)
cbind(year = c("2018", "2019"), round(keywords_per_paper, digits = 2))
| year | reproduc.. | replic.. | repeatab.. | code | software | algorithm(s) | (pre)process.. | data.* | result(s) | repository/ies | all |
| 2018 | 0.16 | 0.1 | 0 | 0.72 | 1.66 | 3.02 | 5.08 | 28.3 | 6.62 | 0.14 | 45.8 |
| 2019 | 1.3 | 0.3 | 0.04 | 0.63 | 1.32 | 4.06 | 7.72 | 32 | 8.18 | 0.11 | 55.7 |
# Keyword counts per 1000 non-stop words: each year's totals (without the
# `year` column) are divided by that year's number of non-stop words, scaled by
# 1000 and rounded to two digits.
n_words_2018 <- nrow(no_stop_words_2018)
n_words_2019 <- nrow(no_stop_words_2019)
keywords_per_1000_words <- rbind(
  dplyr::bind_cols(keywords_2018[-1] / n_words_2018 * 1000),
  dplyr::bind_cols(keywords_2019[-1] / n_words_2019 * 1000)
)
cbind(year = c("2018", "2019"), round(keywords_per_1000_words, digits = 2))
| year | reproduc.. | replic.. | repeatab.. | code | software | algorithm(s) | (pre)process.. | data.* | result(s) | repository/ies | all |
| 2018 | 0.09 | 0.06 | 0 | 0.42 | 0.97 | 1.77 | 2.98 | 16.6 | 3.89 | 0.08 | 26.9 |
| 2019 | 0.74 | 0.17 | 0.02 | 0.36 | 0.75 | 2.3 | 4.38 | 18.2 | 4.64 | 0.06 | 31.6 |
This document is licensed under a Creative Commons Attribution 4.0 International License.
All contained code is licensed under the Apache License 2.0.
# Print the session information (R version, platform, package versions), including base packages, to document the runtime environment.
devtools::session_info(include_base = TRUE)
## ─ Session info ──────────────────────────────────────────────────────────
## setting value
## version R version 3.6.0 (2019-04-26)
## os Ubuntu 18.04.2 LTS
## system x86_64, linux-gnu
## ui X11
## language en_GB:en
## collate en_GB.UTF-8
## ctype en_GB.UTF-8
## tz Europe/Berlin
## date 2019-06-27
##
## ─ Packages ──────────────────────────────────────────────────────────────
## package * version date lib source
## askpass 1.1 2019-01-13 [1] CRAN (R 3.6.0)
## assertthat 0.2.1 2019-03-21 [1] CRAN (R 3.6.0)
## backports 1.1.4 2019-04-10 [1] CRAN (R 3.6.0)
## base * 3.6.0 2019-05-13 [4] local
## callr 3.2.0 2019-03-15 [1] CRAN (R 3.6.0)
## cli 1.1.0 2019-03-19 [1] CRAN (R 3.6.0)
## colorspace 1.4-1 2019-03-18 [1] CRAN (R 3.6.0)
## compiler 3.6.0 2019-05-13 [4] local
## crayon 1.3.4 2017-09-16 [1] CRAN (R 3.6.0)
## datasets * 3.6.0 2019-05-13 [4] local
## desc 1.2.0 2018-05-01 [1] CRAN (R 3.6.0)
## devtools * 2.0.2 2019-04-08 [1] CRAN (R 3.6.0)
## digest 0.6.19 2019-05-20 [1] CRAN (R 3.6.0)
## dplyr * 0.8.1 2019-05-14 [1] CRAN (R 3.6.0)
## evaluate 0.14 2019-05-28 [1] CRAN (R 3.6.0)
## fs 1.3.1 2019-05-06 [1] CRAN (R 3.6.0)
## generics 0.0.2 2018-11-29 [1] CRAN (R 3.6.0)
## ggplot2 * 3.2.0 2019-06-16 [1] CRAN (R 3.6.0)
## ggthemes * 4.2.0 2019-05-13 [1] CRAN (R 3.6.0)
## glue 1.3.1 2019-03-12 [1] CRAN (R 3.6.0)
## googledrive * 0.1.3 2019-01-24 [1] CRAN (R 3.6.0)
## graphics * 3.6.0 2019-05-13 [4] local
## grDevices * 3.6.0 2019-05-13 [4] local
## grid * 3.6.0 2019-05-13 [4] local
## gridBase * 0.4-7 2014-02-24 [1] CRAN (R 3.6.0)
## gridExtra * 2.3 2017-09-09 [1] CRAN (R 3.6.0)
## gtable 0.3.0 2019-03-25 [1] CRAN (R 3.6.0)
## here * 0.1 2017-05-28 [1] CRAN (R 3.6.0)
## hms 0.4.2 2018-03-10 [1] CRAN (R 3.6.0)
## htmltools 0.3.6 2017-04-28 [1] CRAN (R 3.6.0)
## httr * 1.4.0 2018-12-11 [1] CRAN (R 3.6.0)
## huxtable * 4.6.0 2019-06-24 [1] CRAN (R 3.6.0)
## janeaustenr 0.1.5 2017-06-10 [1] CRAN (R 3.6.0)
## knitr * 1.23 2019-05-18 [1] CRAN (R 3.6.0)
## lattice 0.20-38 2018-11-04 [1] CRAN (R 3.6.0)
## lazyeval 0.2.2 2019-03-15 [1] CRAN (R 3.6.0)
## magrittr 1.5 2014-11-22 [1] CRAN (R 3.6.0)
## Matrix 1.2-17 2019-03-22 [1] CRAN (R 3.6.0)
## memoise 1.1.0 2017-04-21 [1] CRAN (R 3.6.0)
## methods * 3.6.0 2019-05-13 [4] local
## munsell 0.5.0 2018-06-12 [1] CRAN (R 3.6.0)
## pdftools * 2.2 2019-03-10 [1] CRAN (R 3.6.0)
## pillar 1.4.1 2019-05-28 [1] CRAN (R 3.6.0)
## pkgbuild 1.0.3 2019-03-20 [1] CRAN (R 3.6.0)
## pkgconfig 2.0.2 2018-08-16 [1] CRAN (R 3.6.0)
## pkgload 1.0.2 2018-10-29 [1] CRAN (R 3.6.0)
## prettyunits 1.0.2 2015-07-13 [1] CRAN (R 3.6.0)
## processx 3.3.1 2019-05-08 [1] CRAN (R 3.6.0)
## ps 1.3.0 2018-12-21 [1] CRAN (R 3.6.0)
## purrr * 0.3.2 2019-03-15 [1] CRAN (R 3.6.0)
## qpdf 1.1 2019-03-07 [1] CRAN (R 3.6.0)
## R6 2.4.0 2019-02-14 [1] CRAN (R 3.6.0)
## RColorBrewer * 1.1-2 2014-12-07 [1] CRAN (R 3.6.0)
## Rcpp 1.0.1 2019-03-17 [1] CRAN (R 3.6.0)
## readr * 1.3.1 2018-12-21 [1] CRAN (R 3.6.0)
## remotes 2.1.0 2019-06-24 [1] CRAN (R 3.6.0)
## rlang * 0.4.0 2019-06-25 [1] CRAN (R 3.6.0)
## rmarkdown 1.13 2019-05-22 [1] CRAN (R 3.6.0)
## rprojroot 1.3-2 2018-01-03 [1] CRAN (R 3.6.0)
## rvest * 0.3.4 2019-05-15 [1] CRAN (R 3.6.0)
## scales 1.0.0 2018-08-09 [1] CRAN (R 3.6.0)
## sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 3.6.0)
## SnowballC * 0.6.0 2019-01-15 [1] CRAN (R 3.6.0)
## stats * 3.6.0 2019-05-13 [4] local
## stringi 1.4.3 2019-03-12 [1] CRAN (R 3.6.0)
## stringr * 1.4.0 2019-02-10 [1] CRAN (R 3.6.0)
## testthat 2.1.1 2019-04-23 [1] CRAN (R 3.6.0)
## tibble * 2.1.3 2019-06-06 [1] CRAN (R 3.6.0)
## tidyselect 0.2.5 2018-10-11 [1] CRAN (R 3.6.0)
## tidytext * 0.2.1 2019-06-14 [1] CRAN (R 3.6.0)
## tokenizers 0.2.1 2018-03-29 [1] CRAN (R 3.6.0)
## tools 3.6.0 2019-05-13 [4] local
## usethis * 1.5.0 2019-04-07 [1] CRAN (R 3.6.0)
## utils * 3.6.0 2019-05-13 [4] local
## withr 2.1.2 2018-03-15 [1] CRAN (R 3.6.0)
## wordcloud * 2.6 2018-08-24 [1] CRAN (R 3.6.0)
## xfun 0.8 2019-06-25 [1] CRAN (R 3.6.0)
## xml2 * 1.2.0 2018-01-24 [1] CRAN (R 3.6.0)
## yaml 2.2.0 2018-07-25 [1] CRAN (R 3.6.0)
##
## [1] /home/daniel/R/x86_64-pc-linux-gnu-library/3.6
## [2] /usr/local/lib/R/site-library
## [3] /usr/lib/R/site-library
## [4] /usr/lib/R/library