#' Extract the tokens whose part-of-speech tag matches a pattern
#'
#' Annotates `x` with sentence and word tokens, tags every word with a
#' part-of-speech (POS) tag, and keeps the words whose tag matches
#' `thisPOSregex`.
#'
#' @param x Text to tag; it is coerced with `as.String()`.
#' @param thisPOSregex Regular expression matched against the POS tags with
#'   `grep()`.
#' @return A single string of `word/TAG` pairs separated by spaces (`""` when
#'   no tag matches).
extractPOS <- function(x, thisPOSregex) {
  x <- as.String(x)
  wordAnnotation <- annotate(
    x,
    list(Maxent_Sent_Token_Annotator(), Maxent_Word_Token_Annotator())
  )
  POSAnnotation <- annotate(x, Maxent_POS_Tag_Annotator(), wordAnnotation)
  # The annotation also holds sentence spans; only word spans carry a POS tag.
  POSwords <- subset(POSAnnotation, type == "word")
  # vapply() (unlike sapply()) always yields a character vector, even when
  # there are no word annotations at all.
  tags <- vapply(POSwords$features, "[[", character(1), "POS")
  thisPOSindex <- grep(thisPOSregex, tags)
  tokenizedAndTagged <- sprintf(
    "%s/%s",
    x[POSwords][thisPOSindex],
    tags[thisPOSindex]
  )
  paste(tokenizedAndTagged, collapse = " ")
}

Data is from topfamousquotes, Cleaning data with OpenRefine.
# Read the quotes file and collapse all of its lines into one string.
fn <- 'data/noauthor.txt'
text <- paste(readLines(fn, encoding = 'UTF-8'), collapse = " ")

# Keep only the tokens whose POS tag matches "NN", as "word/TAG" text.
text_noun <- unlist(lapply(text, extractPOS, "NN"))

# Stem the words. The lower-cased tag names are passed as stopwords,
# presumably so the "/TAG" suffixes are not counted as words (TODO confirm).
tws <- tokenize_word_stems(text_noun, stopwords = c("nn", "nns", "nnp"))

# Tabulate the stems once and sort by descending frequency.
stem_counts <- table(tws)
tws_df <- tibble(
  word = names(stem_counts),
  freq = as.integer(stem_counts)
) %>%
  arrange(desc(freq))
# Exclude 'libertarian'. NOTE: this drops the first (most frequent) row by
# position, so it relies on 'libertarian' being the top stem.
tws_df <- tws_df[-1, ]

| word | freq |
|---|---|
| peopl | 12 |
| state | 11 |
| liber | 10 |
| govern | 9 |
| parti | 9 |
| freedom | 8 |
| i | 8 |
| order | 6 |
| way | 6 |
| adult | 5 |
| conserv | 5 |
| conservat | 5 |
| democrat | 5 |
| jesus | 5 |
| liberti | 5 |
| republican | 5 |
| societi | 5 |