## Count of X-references (cross-references) in UN administrative documents
library(tm)
## Loading required package: NLP
library(stringr)
## Directory that holds the PDF documents to scan.
files <- file.path("C:/Users/ambra/Desktop/DM/DM2")

## Read every PDF in the directory and build the corpus. With hundreds of
## files this step takes several minutes.
documents <- Corpus(DirSource(files), readerControl = list(reader = readPDF))

## Build the dictionary of terms to count: the first column of a headerless
## CSV, converted to lower-case character strings.
dict2 <- read.csv(
  "C:\\Users\\ambra\\Desktop\\DM\\DMdocslist.csv",
  header = FALSE
)$V1
## tolower() is already vectorised; calling it directly always yields a plain
## character vector (sapply() would return a list for an empty dictionary).
dict3 <- tolower(as.character(dict2))

## Clean the corpus text through tm_map.
## Pass 1: replace a match of "(//. )$" (two slashes, any character, a space,
## at the end of a string) with a single space.
## NOTE(review): "//." may have been meant as "\\." (a literal full stop) --
## confirm the intended pattern before changing it.
documents1 <- tm_map(
  documents,
  content_transformer(function(x) str_replace_all(x, "(//. )$", " "))
)
## Pass 2: replace commas, colons, semicolons and form feeds with a space; the
## first alternative additionally targets a space/punctuation character at the
## start of a string.
## NOTE(review): inside "[.,:-;]" the sequence ":-;" reads as a range from ':'
## to ';', not as a literal hyphen -- confirm whether '-' was meant to match.
documents2 <- tm_map(
  documents1,
  content_transformer(
    function(x) str_replace_all(x, "(^[ [.,:-;]])|[[,:;]]|\\f", " ")
  )
)

## Count only the dictionary terms: rows are terms, columns are documents.
corpus.tdm <- TermDocumentMatrix(documents2, list(dictionary = dict3))

## Convert the whole term-document matrix to a dense data frame.
d <- as.data.frame(as.matrix(corpus.tdm))

## Drop columns (documents) whose sum is 0 because no dictionary term matched.
## drop = FALSE keeps the result a data frame even when exactly one document
## remains; otherwise it collapses to a bare vector and the CSV loses the term
## row names and the document column name.
y <- d[, colSums(d) != 0, drop = FALSE]
write.csv(
  y,
  file = "C:/Users/ambra/Desktop/DM/Xreferences/XreferencesMarch1.csv"
)