Analysis of Libertarian Quotes

Load libraries

library(readr)
library(dplyr)
library(wordcloud)
library(tokenizers)
library(openNLP)
library(NLP)
library(readr)

Custom Functions

#' Extract the words of a text whose part-of-speech tag matches a pattern.
#'
#' The text is sentence- and word-tokenized, POS-tagged with the Maxent
#' annotators, and every word whose tag matches `thisPOSregex` is kept.
#'
#' @param x Text to tag; it is coerced with `as.String()`.
#' @param thisPOSregex Regular expression passed to `grep()` and matched
#'   against each word's POS tag. The pattern is not anchored here, so a
#'   pattern such as "NN" also matches any longer tag that contains "NN".
#' @return A single character string of "word/TAG" pairs separated by one
#'   space; the empty string when no tag matches.
extractPOS <- function(x, thisPOSregex) {
  x <- as.String(x)

  # Sentence and word boundaries must be annotated before POS tagging.
  tokenAnnotators <- list(
    Maxent_Sent_Token_Annotator(),
    Maxent_Word_Token_Annotator()
  )
  wordAnnotation <- annotate(x, tokenAnnotators)
  POSAnnotation <- annotate(x, Maxent_POS_Tag_Annotator(), wordAnnotation)

  # Sentence annotations carry no POS feature, so keep word annotations only.
  POSwords <- subset(POSAnnotation, type == "word")

  # vapply() always yields a character vector (character(0) when there are no
  # words) and fails loudly if a word has no single POS tag, whereas sapply()
  # would silently fall back to a list.
  tags <- vapply(
    POSwords$features,
    function(wordFeatures) wordFeatures[["POS"]],
    character(1)
  )

  thisPOSindex <- grep(thisPOSregex, tags)
  tokenizedAndTagged <- sprintf(
    "%s/%s",
    x[POSwords][thisPOSindex],
    tags[thisPOSindex]
  )
  paste(tokenizedAndTagged, collapse = " ")
}

Getting and Cleaning Data

The data comes from topfamousquotes and was cleaned with OpenRefine.

# Read the quotes file and join all of its lines into one string.
fn <- "data/noauthor.txt"
text <- paste(readLines(fn, encoding = "UTF-8"), collapse = " ")

# Keep only the words whose POS tag matches "NN"; extractPOS() returns them
# as space-separated "word/TAG" pairs.
text_noun <- unlist(lapply(text, extractPOS, "NN"))

# Stem the tagged nouns. The tags themselves are listed as stopwords so they
# are not counted as words (presumably they are tokenized as lowercase words
# of their own). "nnps" is included because the "NN" pattern matches that tag
# as well.
tws <- tokenize_word_stems(
  text_noun,
  stopwords = c("nn", "nns", "nnp", "nnps")
)

# Count each stem once and store plain integers rather than a table object.
stem_counts <- table(unlist(tws))
tws_df <- tibble(
  word = names(stem_counts),
  freq = as.integer(stem_counts)
) %>%
  arrange(desc(freq))

# Exclude 'libertarian': this drops the most frequent stem by position, so it
# relies on 'libertarian' being the top row of the sorted table.
tws_df <- tws_df[-1, ]

tws_df %>% filter(freq > 4)

word	freq
peopl	12
state	11
liber	10
govern	9
parti	9
freedom	8
i	8
order	6
way	6
adult	5
conserv	5
conservat	5
democrat	5
jesus	5
liberti	5
republican	5
societi	5

Make the default word cloud

# Four-colour palette handed to wordcloud() through its `colors` argument.
pal <- c(
  "#ADD5F7",
  "#7FB2F0",
  "#4E7AC7",
  "#35478C"
)

# Draw at most 30 stems from the frequency table, placed in random order.
wordcloud(
  words = tws_df$word,
  freq = tws_df$freq,
  max.words = 30,
  random.order = TRUE,
  colors = pal
)