提取某一类词性的单词
# load library
library(NLP)
library(openNLP)
# 一个额外编写的函数
# Tag every word of a text with its part of speech using openNLP.
#
# Args:
#   x:   Text to tag; it is converted with as.String() before annotation.
#   ...: Accepted but not used.
#
# Returns:
#   A list with two elements:
#     POStagged - a single string of "word/TAG" items separated by spaces.
#     POStags   - the POS feature taken from each word annotation.
tagPOS <- function(x, ...) {
  doc <- as.String(x)
  # Declare the whole input as one sentence spanning all of its characters.
  whole_doc <- Annotation(1L, "sentence", 1L, nchar(doc))
  tokenised <- annotate(doc, Maxent_Word_Token_Annotator(), whole_doc)
  tagged <- annotate(doc, Maxent_POS_Tag_Annotator(), tokenised)
  # Keep only the word annotations; the sentence annotation carries no tag.
  word_spans <- tagged[tagged$type == "word"]
  pos <- unlist(lapply(word_spans$features, function(feature) feature[["POS"]]))
  list(
    POStagged = paste(sprintf("%s/%s", doc[word_spans], pos), collapse = " "),
    POStags = pos
  )
}
# 读取数据
# Extract verbs, nouns, adjectives and adverbs from one text file.
#
# The file is read with readLines() and POS-tagged with tagPOS(). For each
# word class, the unique words longer than 2 and shorter than 20 characters
# are appended to "<path_result>/<journal>_<suffix>.txt", where <suffix> is
#   VB  for tags starting with "VB",
#   NN  for tags starting with "N",
#   ADJ for tags starting with "J",
#   ADV for tags starting with "R".
#
# Args:
#   txt_name:    Path of the text file to process.
#   journal:     Prefix of the result file names. When NULL (the default) the
#                variable `journal` visible from the environment in which this
#                function was defined is used, so existing one-argument calls
#                keep working.
#   path_result: Directory that receives the word lists; created if missing.
#
# Returns:
#   NULL, invisibly. The function is called for its side effect of appending
#   to the four result files.
dlm_OpenNLR <- function(txt_name, journal = NULL, path_result = "./result/") {
  if (is.null(journal)) {
    # Skip this call's own frame (where `journal` is the NULL argument) and
    # resolve the name the way a free variable in this function would be.
    journal <- get("journal", envir = parent.env(environment()))
  }
  # Drop trailing separators so file.path() builds "./result/<name>" and
  # dir.create() is not handed a name ending in "/".
  out_dir <- sub("[/\\\\]+$", "", path_result)
  # write.table() cannot create a missing directory itself.
  if (nzchar(out_dir) && !dir.exists(out_dir)) {
    dir.create(out_dir, recursive = TRUE)
  }

  txt <- readLines(txt_name)
  acqTag <- tagPOS(txt)

  # tagPOS() joins its "word/TAG" items with single spaces. Reading each item
  # on its own keeps every word paired with its tag. Splitting the whole
  # string at the tag instead lets the text after the last match, and any
  # fragment whose word contains a non-word character, leak into the output.
  items <- strsplit(acqTag$POStagged, " ", fixed = TRUE)[[1]]
  items <- items[grepl("/", items, fixed = TRUE)]
  # The tag is whatever follows the last "/"; the word is what precedes it,
  # without trailing punctuation.
  tags <- sub("^.*/", "", items)
  words <- sub("[[:punct:]]*/[^/]*$", "", items)

  # File suffix -> regular expression selecting the tags of that word class.
  word_classes <- c(VB = "^VB", NN = "^N", ADJ = "^J", ADV = "^R")
  for (suffix in names(word_classes)) {
    .dlm_write_words(
      words[grepl(word_classes[[suffix]], tags)],
      file.path(out_dir, paste0(journal, "_", suffix, ".txt"))
    )
  }
  invisible(NULL)
}

# Append the unique words of 3 to 19 characters to `path_save`, one per line.
.dlm_write_words <- function(words, path_save) {
  keep <- unique(words)
  n_char <- nchar(keep)
  keep <- keep[n_char > 2 & n_char < 20]
  write.table(keep, path_save,
    row.names = FALSE, col.names = FALSE,
    quote = FALSE, append = TRUE
  )
  invisible(keep)
}
# Folder holding the PDFs to convert.
dest <- "D:/Sync/Mendeley_PDF/Unknown"
# Full paths of the PDF files in `dest`. `pattern` is a regular expression, so
# it is anchored to the ".pdf" extension instead of matching "pdf" anywhere in
# a file name.
myfiles <- list.files(path = dest, pattern = "\\.pdf$", full.names = TRUE)
# Run pdftotext on every PDF. Only the PDF path is passed; pdftotext is
# expected to write the text file into the same directory as the PDF.
# Adjust the location of pdftotext.exe to match your machine.
# shQuote() quotes both paths so the spaces in them survive, and wait = FALSE
# starts each conversion without waiting for it to finish.
lapply(myfiles, function(i) {
  system(
    paste(shQuote("D:/Program Files/xpdf/pdftotext.exe"), shQuote(i)),
    wait = FALSE
  )
})
## [[1]]
## [1] 0
##
## [[2]]
## [1] 0
##
## [[3]]
## [1] 0
##
## [[4]]
## [1] 0
##
## [[5]]
## [1] 0
##
## [[6]]
## [1] 0
##
## [[7]]
## [1] 0
##
## [[8]]
## [1] 0
##
## [[9]]
## [1] 0
##
## [[10]]
## [1] 0
##
## [[11]]
## [1] 0
##
## [[12]]
## [1] 0
##
## [[13]]
## [1] 0
##
## [[14]]
## [1] 0
##
## [[15]]
## [1] 0
##
## [[16]]
## [1] 0
##
## [[17]]
## [1] 0
##
## [[18]]
## [1] 0
##
## [[19]]
## [1] 0
##
## [[20]]
## [1] 0
##
## [[21]]
## [1] 0
##
## [[22]]
## [1] 0
##
## [[23]]
## [1] 0
##
## [[24]]
## [1] 0
##
## [[25]]
## [1] 0
##
## [[26]]
## [1] 0
##
## [[27]]
## [1] 0
##
## [[28]]
## [1] 0
##
## [[29]]
## [1] 0
##
## [[30]]
## [1] 0
##
## [[31]]
## [1] 0
##
## [[32]]
## [1] 0
##
## [[33]]
## [1] 0
##
## [[34]]
## [1] 0
##
## [[35]]
## [1] 0
##
## [[36]]
## [1] 0
##
## [[37]]
## [1] 0
##
## [[38]]
## [1] 0
##
## [[39]]
## [1] 0
##
## [[40]]
## [1] 0
##
## [[41]]
## [1] 0
##
## [[42]]
## [1] 0
##
## [[43]]
## [1] 0
##
## [[44]]
## [1] 0
##
## [[45]]
## [1] 0
##
## [[46]]
## [1] 0
##
## [[47]]
## [1] 0
##
## [[48]]
## [1] 0
##
## [[49]]
## [1] 0
##
## [[50]]
## [1] 0
##
## [[51]]
## [1] 0
##
## [[52]]
## [1] 0
##
## [[53]]
## [1] 0
##
## [[54]]
## [1] 0
##
## [[55]]
## [1] 0
##
## [[56]]
## [1] 0
##
## [[57]]
## [1] 0
##
## [[58]]
## [1] 0
##
## [[59]]
## [1] 0
##
## [[60]]
## [1] 0
##
## [[61]]
## [1] 0
##
## [[62]]
## [1] 0
##
## [[63]]
## [1] 0
##
## [[64]]
## [1] 0
##
## [[65]]
## [1] 0
##
## [[66]]
## [1] 0
##
## [[67]]
## [1] 0
##
## [[68]]
## [1] 0
##
## [[69]]
## [1] 0
##
## [[70]]
## [1] 0
##
## [[71]]
## [1] 0
##
## [[72]]
## [1] 0
##
## [[73]]
## [1] 0
##
## [[74]]
## [1] 0
##
## [[75]]
## [1] 0
##
## [[76]]
## [1] 0
##
## [[77]]
## [1] 0
##
## [[78]]
## [1] 0
##
## [[79]]
## [1] 0
##
## [[80]]
## [1] 0
##
## [[81]]
## [1] 0
##
## [[82]]
## [1] 0
##
## [[83]]
## [1] 0
##
## [[84]]
## [1] 0
##
## [[85]]
## [1] 0
##
## [[86]]
## [1] 0
##
## [[87]]
## [1] 0
##
## [[88]]
## [1] 0
##
## [[89]]
## [1] 0
##
## [[90]]
## [1] 0
##
## [[91]]
## [1] 0
##
## [[92]]
## [1] 0
##
## [[93]]
## [1] 0
##
## [[94]]
## [1] 0
##
## [[95]]
## [1] 0
##
## [[96]]
## [1] 0
##
## [[97]]
## [1] 0
##
## [[98]]
## [1] 0
##
## [[99]]
## [1] 0
##
## [[100]]
## [1] 0
##
## [[101]]
## [1] 0
##
## [[102]]
## [1] 0
##
## [[103]]
## [1] 0
##
## [[104]]
## [1] 0
##
## [[105]]
## [1] 0
##
## [[106]]
## [1] 0
##
## [[107]]
## [1] 0
##
## [[108]]
## [1] 0
##
## [[109]]
## [1] 0
##
## [[110]]
## [1] 0
##
## [[111]]
## [1] 0
##
## [[112]]
## [1] 0
##
## [[113]]
## [1] 0
##
## [[114]]
## [1] 0
##
## [[115]]
## [1] 0
##
## [[116]]
## [1] 0
##
## [[117]]
## [1] 0
##
## [[118]]
## [1] 0
##
## [[119]]
## [1] 0
##
## [[120]]
## [1] 0
##
## [[121]]
## [1] 0
##
## [[122]]
## [1] 0
##
## [[123]]
## [1] 0
##
## [[124]]
## [1] 0
##
## [[125]]
## [1] 0
##
## [[126]]
## [1] 0
##
## [[127]]
## [1] 0
##
## [[128]]
## [1] 0
##
## [[129]]
## [1] 0
##
## [[130]]
## [1] 0
##
## [[131]]
## [1] 0
##
## [[132]]
## [1] 0
##
## [[133]]
## [1] 0
##
## [[134]]
## [1] 0
##
## [[135]]
## [1] 0
##
## [[136]]
## [1] 0
##
## [[137]]
## [1] 0
将pdf转换成TXT并提取动词
转换成TXT并读取
# 目标文件夹 folder with 1000s of PDFs
# Journal whose PDF folder is processed. dlm_OpenNLR() reads this variable
# when it names its result files.
journal <- "New Phytologist"
dest <- file.path("D:/Sync/Mendeley_PDF", journal)
# Full paths of the PDF files in `dest`. `pattern` is a regular expression, so
# it is anchored to the ".pdf" extension instead of matching "pdf" anywhere in
# a file name.
myfiles <- list.files(path = dest, pattern = "\\.pdf$", full.names = TRUE)
# Run pdftotext on every PDF. Only the PDF path is passed; pdftotext is
# expected to write the text file into the same directory as the PDF.
# Adjust the location of pdftotext.exe to match your machine.
# Wait for each conversion to finish: the text files are listed right after
# this step, and an asynchronous launch could still be writing them.
lapply(myfiles, function(i) {
  system(
    paste(shQuote("D:/Program Files/xpdf/pdftotext.exe"), shQuote(i)),
    wait = TRUE
  )
})
## [[1]]
## [1] 0
##
## [[2]]
## [1] 0
##
## [[3]]
## [1] 0
##
## [[4]]
## [1] 0
##
## [[5]]
## [1] 0
##
## [[6]]
## [1] 0
##
## [[7]]
## [1] 0
##
## [[8]]
## [1] 0
##
## [[9]]
## [1] 0
##
## [[10]]
## [1] 0
##
## [[11]]
## [1] 0
##
## [[12]]
## [1] 0
##
## [[13]]
## [1] 0
##
## [[14]]
## [1] 0
##
## [[15]]
## [1] 0
# 读取TXT
# Full paths of the text files in `dest`. `pattern` is a regular expression,
# so it is anchored to the ".txt" extension instead of matching "txt" anywhere
# in a file name.
mytxtfiles <- list.files(path = dest, pattern = "\\.txt$", full.names = TRUE)
提取动词
# for(i in 1:length(mytxtfiles)){ lapply(mytxtfiles[i],
# FUN=function(txt_name) dlm_OpenNLR(txt_name)) }