提取某一类词性的单词

# load library
library(NLP)
library(openNLP)

# An additional, custom-written helper function
# Tag every word of `x` with its part of speech using the openNLP annotators.
#
# Args:
#   x:   text to tag; it is converted with as.String() before annotation.
#   ...: accepted but not used by this function.
#
# Returns:
#   A list with two elements:
#     POStagged - one string of "word/TAG" pairs separated by single spaces
#     POStags   - the POS tags of the word annotations, in annotation order
tagPOS <- function(x, ...) {
    text <- as.String(x)

    # Seed annotation: a single "sentence" spanning characters 1..nchar(text),
    # which the word tokenizer then works inside.
    whole_text <- Annotation(1L, "sentence", 1L, nchar(text))

    tokenizer <- Maxent_Word_Token_Annotator()
    tokenized <- annotate(text, tokenizer, whole_text)
    tagged <- annotate(text, Maxent_POS_Tag_Annotator(), tokenized)

    # Keep only the word-level annotations (drops the seed sentence annotation)
    word_annotations <- tagged[tagged$type == "word"]
    POStags <- unlist(lapply(word_annotations$features, function(feature) feature[["POS"]]))

    word_tag_pairs <- sprintf("%s/%s", text[word_annotations], POStags)
    list(POStagged = paste(word_tag_pairs, collapse = " "), POStags = POStags)
}

# Read the data
# Private helper: pull the words that directly precede a POS tag out of a
# "word/TAG word/TAG ..." string.
#
# Args:
#   tagged:      single string of "word/TAG" pairs (tagPOS()$POStagged).
#   tag_pattern: regular expression matching the tag(s) of interest,
#                including the leading "/".
#
# Returns:
#   Character vector of unique words that are 3 to 19 characters long.
.extract_tagged_words <- function(tagged, tag_pattern) {
    # Splitting on the tag leaves the wanted word at the end of each piece
    pieces <- strsplit(tagged, tag_pattern)[[1]]
    words <- unique(sub("(^.*\\s)(\\w+$)", "\\2", pieces))

    # Pieces the pattern above could not reduce to a single token (the text
    # after the last tag, or pieces whose last token is not purely \w
    # characters) come back unchanged and still contain whitespace or a "/TAG"
    # separator; they are not words, so drop them.
    words <- words[!grepl("[/[:space:]]", words)]

    # Keep words with more than 2 and fewer than 20 characters
    word_length <- nchar(words)
    words[word_length > 2 & word_length < 20]
}

# Extract the verbs, nouns, adjectives and adverbs of one text file and append
# them to per-journal word lists.
#
# The file is read with readLines(), tagged with tagPOS(), and for each word
# class the unique words are appended to
#   <path_result><journal_name>_VB.txt   (tags matching /VB.?)
#   <path_result><journal_name>_NN.txt   (tags matching /N.?)
#   <path_result><journal_name>_ADJ.txt  (tags matching /J.?)
#   <path_result><journal_name>_ADV.txt  (tags matching /R.?)
#
# Args:
#   txt_name:     path of the text file to process.
#   journal_name: prefix of the output files. When NULL (default) the variable
#                 `journal` visible from this function (normally the global
#                 one set by the script) is used.
#   path_result:  output directory prefix, pasted as-is in front of the file
#                 name, so it should end with "/". Created if missing.
#
# Returns:
#   NULL, invisibly; called for its side effect of writing files.
dlm_OpenNLR <- function(txt_name, journal_name = NULL, path_result = "./result/") {
    if (is.null(journal_name)) {
        if (!exists("journal")) {
            stop("`journal_name` was not supplied and no `journal` variable is defined",
                call. = FALSE)
        }
        journal_name <- journal
    }

    # write.table() cannot create missing directories. The trailing separator
    # is stripped before the check because directory tests on Windows may not
    # accept it.
    out_dir <- sub("[/\\\\]+$", "", path_result)
    if (nzchar(out_dir) && !dir.exists(out_dir)) {
        dir.create(out_dir, recursive = TRUE)
    }

    txt <- readLines(txt_name)
    acqTag <- tagPOS(txt)

    # Output-file suffix -> regular expression of the tag(s) for that class
    tag_patterns <- c(
        VB = "[[:punct:]]*/VB.?",
        NN = "[[:punct:]]*/N.?",
        ADJ = "[[:punct:]]*/J.?",
        ADV = "[[:punct:]]*/R.?"
    )

    for (suffix in names(tag_patterns)) {
        words <- .extract_tagged_words(acqTag$POStagged, tag_patterns[[suffix]])
        path_save <- paste0(path_result, journal_name, "_", suffix, ".txt")
        # append = TRUE: the lists accumulate over all files of a journal
        write.table(words, path_save, row.names = FALSE, col.names = FALSE,
            quote = FALSE, append = TRUE)
    }

    invisible(NULL)
}
# folder with 1000s of PDFs
dest <- "D:/Sync/Mendeley_PDF/Unknown"

# make a vector of PDF file names; `pattern` is a regular expression, so it is
# anchored to the file extension (a bare "pdf" matches any name containing
# those letters) and matched case-insensitively so ".PDF" files are included
myfiles <- list.files(path = dest, pattern = "\\.pdf$", full.names = TRUE,
    ignore.case = TRUE)

# convert each PDF file that is named in the vector into a text file; the text
# file is created in the same directory as the PDFs. Note that my
# pdftotext.exe is in a different location to yours.
# wait = TRUE runs the conversions one at a time and returns each command's
# real exit status; with wait = FALSE every status is 0 regardless of success
# and all conversions are started at once.
lapply(myfiles, function(pdf_file) {
    command <- paste("\"D:/Program Files/xpdf/pdftotext.exe\"",
        paste0("\"", pdf_file, "\""))
    system(command, wait = TRUE)
})
## [[1]]
## [1] 0
## 
## [[2]]
## [1] 0
## 
## [[3]]
## [1] 0
## 
## [[4]]
## [1] 0
## 
## [[5]]
## [1] 0
## 
## [[6]]
## [1] 0
## 
## [[7]]
## [1] 0
## 
## [[8]]
## [1] 0
## 
## [[9]]
## [1] 0
## 
## [[10]]
## [1] 0
## 
## [[11]]
## [1] 0
## 
## [[12]]
## [1] 0
## 
## [[13]]
## [1] 0
## 
## [[14]]
## [1] 0
## 
## [[15]]
## [1] 0
## 
## [[16]]
## [1] 0
## 
## [[17]]
## [1] 0
## 
## [[18]]
## [1] 0
## 
## [[19]]
## [1] 0
## 
## [[20]]
## [1] 0
## 
## [[21]]
## [1] 0
## 
## [[22]]
## [1] 0
## 
## [[23]]
## [1] 0
## 
## [[24]]
## [1] 0
## 
## [[25]]
## [1] 0
## 
## [[26]]
## [1] 0
## 
## [[27]]
## [1] 0
## 
## [[28]]
## [1] 0
## 
## [[29]]
## [1] 0
## 
## [[30]]
## [1] 0
## 
## [[31]]
## [1] 0
## 
## [[32]]
## [1] 0
## 
## [[33]]
## [1] 0
## 
## [[34]]
## [1] 0
## 
## [[35]]
## [1] 0
## 
## [[36]]
## [1] 0
## 
## [[37]]
## [1] 0
## 
## [[38]]
## [1] 0
## 
## [[39]]
## [1] 0
## 
## [[40]]
## [1] 0
## 
## [[41]]
## [1] 0
## 
## [[42]]
## [1] 0
## 
## [[43]]
## [1] 0
## 
## [[44]]
## [1] 0
## 
## [[45]]
## [1] 0
## 
## [[46]]
## [1] 0
## 
## [[47]]
## [1] 0
## 
## [[48]]
## [1] 0
## 
## [[49]]
## [1] 0
## 
## [[50]]
## [1] 0
## 
## [[51]]
## [1] 0
## 
## [[52]]
## [1] 0
## 
## [[53]]
## [1] 0
## 
## [[54]]
## [1] 0
## 
## [[55]]
## [1] 0
## 
## [[56]]
## [1] 0
## 
## [[57]]
## [1] 0
## 
## [[58]]
## [1] 0
## 
## [[59]]
## [1] 0
## 
## [[60]]
## [1] 0
## 
## [[61]]
## [1] 0
## 
## [[62]]
## [1] 0
## 
## [[63]]
## [1] 0
## 
## [[64]]
## [1] 0
## 
## [[65]]
## [1] 0
## 
## [[66]]
## [1] 0
## 
## [[67]]
## [1] 0
## 
## [[68]]
## [1] 0
## 
## [[69]]
## [1] 0
## 
## [[70]]
## [1] 0
## 
## [[71]]
## [1] 0
## 
## [[72]]
## [1] 0
## 
## [[73]]
## [1] 0
## 
## [[74]]
## [1] 0
## 
## [[75]]
## [1] 0
## 
## [[76]]
## [1] 0
## 
## [[77]]
## [1] 0
## 
## [[78]]
## [1] 0
## 
## [[79]]
## [1] 0
## 
## [[80]]
## [1] 0
## 
## [[81]]
## [1] 0
## 
## [[82]]
## [1] 0
## 
## [[83]]
## [1] 0
## 
## [[84]]
## [1] 0
## 
## [[85]]
## [1] 0
## 
## [[86]]
## [1] 0
## 
## [[87]]
## [1] 0
## 
## [[88]]
## [1] 0
## 
## [[89]]
## [1] 0
## 
## [[90]]
## [1] 0
## 
## [[91]]
## [1] 0
## 
## [[92]]
## [1] 0
## 
## [[93]]
## [1] 0
## 
## [[94]]
## [1] 0
## 
## [[95]]
## [1] 0
## 
## [[96]]
## [1] 0
## 
## [[97]]
## [1] 0
## 
## [[98]]
## [1] 0
## 
## [[99]]
## [1] 0
## 
## [[100]]
## [1] 0
## 
## [[101]]
## [1] 0
## 
## [[102]]
## [1] 0
## 
## [[103]]
## [1] 0
## 
## [[104]]
## [1] 0
## 
## [[105]]
## [1] 0
## 
## [[106]]
## [1] 0
## 
## [[107]]
## [1] 0
## 
## [[108]]
## [1] 0
## 
## [[109]]
## [1] 0
## 
## [[110]]
## [1] 0
## 
## [[111]]
## [1] 0
## 
## [[112]]
## [1] 0
## 
## [[113]]
## [1] 0
## 
## [[114]]
## [1] 0
## 
## [[115]]
## [1] 0
## 
## [[116]]
## [1] 0
## 
## [[117]]
## [1] 0
## 
## [[118]]
## [1] 0
## 
## [[119]]
## [1] 0
## 
## [[120]]
## [1] 0
## 
## [[121]]
## [1] 0
## 
## [[122]]
## [1] 0
## 
## [[123]]
## [1] 0
## 
## [[124]]
## [1] 0
## 
## [[125]]
## [1] 0
## 
## [[126]]
## [1] 0
## 
## [[127]]
## [1] 0
## 
## [[128]]
## [1] 0
## 
## [[129]]
## [1] 0
## 
## [[130]]
## [1] 0
## 
## [[131]]
## [1] 0
## 
## [[132]]
## [1] 0
## 
## [[133]]
## [1] 0
## 
## [[134]]
## [1] 0
## 
## [[135]]
## [1] 0
## 
## [[136]]
## [1] 0
## 
## [[137]]
## [1] 0

将pdf转换成TXT并提取动词

转换成TXT并读取

# Target folder: folder with 1000s of PDFs
# Journal to process; also used as the prefix of the result files
journal <- "New Phytologist"
dest <- paste0("D:/Sync/Mendeley_PDF/", journal)

# make a vector of PDF file names; `pattern` is a regular expression, so it is
# anchored to the file extension (a bare "pdf" matches any name containing
# those letters) and matched case-insensitively so ".PDF" files are included
myfiles <- list.files(path = dest, pattern = "\\.pdf$", full.names = TRUE,
    ignore.case = TRUE)

# convert each PDF file that is named in the vector into a text file; the text
# file is created in the same directory as the PDFs. Note that my
# pdftotext.exe is in a different location to yours.
# wait = TRUE makes sure every conversion has finished before the script goes
# on to list the generated .txt files, and returns each command's real exit
# status (with wait = FALSE the status is always 0).
lapply(myfiles, function(pdf_file) {
    command <- paste("\"D:/Program Files/xpdf/pdftotext.exe\"",
        paste0("\"", pdf_file, "\""))
    system(command, wait = TRUE)
})
## [[1]]
## [1] 0
## 
## [[2]]
## [1] 0
## 
## [[3]]
## [1] 0
## 
## [[4]]
## [1] 0
## 
## [[5]]
## [1] 0
## 
## [[6]]
## [1] 0
## 
## [[7]]
## [1] 0
## 
## [[8]]
## [1] 0
## 
## [[9]]
## [1] 0
## 
## [[10]]
## [1] 0
## 
## [[11]]
## [1] 0
## 
## [[12]]
## [1] 0
## 
## [[13]]
## [1] 0
## 
## [[14]]
## [1] 0
## 
## [[15]]
## [1] 0
# Read the TXT files
# Anchor the regular expression to the extension: a bare "txt" would match any
# file whose name merely contains those letters
mytxtfiles <- list.files(path = dest, pattern = "\\.txt$", full.names = TRUE)

提取动词

# for(i in 1:length(mytxtfiles)){ lapply(mytxtfiles[i],
# FUN=function(txt_name) dlm_OpenNLR(txt_name)) }