# Read in news, blogs and twitter data
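# (Optional) If the three files are not already in the working directory,
# fetch them first. The dataset URL below is assumed from the Coursera
# Data Science Capstone materials; adjust the paths if yours differ.
if (!file.exists("en_US.news.txt")) {
  download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
                destfile = "Coursera-SwiftKey.zip")
  unzip("Coursera-SwiftKey.zip", junkpaths = TRUE,
        files = c("final/en_US/en_US.news.txt",
                  "final/en_US/en_US.blogs.txt",
                  "final/en_US/en_US.twitter.txt"))
}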
newsData <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
blogsData <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
twitterData <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
# Obtain and print basic information on the three files
library(stringi)
# File sizes in MB
newsDataSize <- file.info("en_US.news.txt")$size / 1024^2
blogsDataSize <- file.info("en_US.blogs.txt")$size / 1024^2
twitterDataSize <- file.info("en_US.twitter.txt")$size / 1024^2
# Word counts per line in each file (summed in the table below)
newsDataWords <- stri_count_words(newsData)
blogsDataWords <- stri_count_words(blogsData)
twitterDataWords <- stri_count_words(twitterData)
# Print table with basic summary of the three files - news, blogs and twitter
data.frame(source = c("news", "blogs", "twitter"),
           file_size_MB = c(newsDataSize, blogsDataSize, twitterDataSize),
           num_lines = c(length(newsData), length(blogsData), length(twitterData)),
           num_words = c(sum(newsDataWords), sum(blogsDataWords), sum(twitterDataWords)))
## source file_size_MB num_lines num_words
## 1 news 196.2775 77259 2674536
## 2 blogs 200.4242 899288 37546246
## 3 twitter 159.3641 2360148 30093410
# load text mining and NLP packages
library(tm)
library(NLP)
# Take a 2% sample of each source to build the corpus
# (roughly 66,700 lines in total, per the line counts above)
set.seed(4234)
dataSample1 <- c(sample(newsData, round(length(newsData) * 0.02)),
                 sample(blogsData, round(length(blogsData) * 0.02)),
                 sample(twitterData, round(length(twitterData) * 0.02)))
# remove non-English characters from the data sample using iconv
dataSample2 <- iconv(dataSample1, "latin1", "ASCII", sub="")
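# Illustrative check of what the conversion does: bytes outside ASCII are
# simply dropped, so accented characters disappear rather than being mangled
iconv("r\u00e9sum\u00e9", "latin1", "ASCII", sub = "")  # returns "rsum"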
# create the corpus
corpus1 <- VCorpus(VectorSource(dataSample2))
# profanity removal list - use Carnegie Mellon University's resource: 'Offensive/Profane Word List' (https://www.cs.cmu.edu/~biglou/resources/bad-words.txt)
download.file("https://www.cs.cmu.edu/~biglou/resources/bad-words.txt", destfile = "bad-words.txt")
profane_words <- read.delim("bad-words.txt", sep = ":", header = FALSE,
                            stringsAsFactors = FALSE)
profane_words <- profane_words[, 1]
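# Spot-check the loaded list - it should be a plain character vector with
# one word per element before handing it to removeWords()
str(profane_words)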
# remove additional whitespace
corpus1 <- tm_map(corpus1, stripWhitespace)
# remove numbers
corpus1 <- tm_map(corpus1, removeNumbers)
# remove punctuation marks
corpus1 <- tm_map(corpus1, removePunctuation)
# remove profane words referenced in custom list
corpus1 <- tm_map(corpus1, removeWords, profane_words)
# transform to plain text document
corpus1 <- tm_map(corpus1, PlainTextDocument)
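# Spot-check the cleaning on a couple of documents (content only)
lapply(corpus1[1:2], as.character)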
library(ggplot2)
library(RWeka)
# Tokenizer functions for n-grams
# Unigram tokenizer - 1 word
unigramToken <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
# Bigram tokenizer - 2 words
bigramToken <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
# Trigram tokenizer - 3 words
trigramToken <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
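# Quick illustrative check: on a toy sentence the bigram tokenizer should
# return the overlapping two-word phrases
# "the quick", "quick brown", "brown fox"
bigramToken("the quick brown fox")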
# RWeka tokenizers can conflict with tm's parallel processing; force single-core
options(mc.cores = 1)
# helper function to find frequency of n-grams in the corpus
topFreq <- function(tdm) {
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  data.frame(word = names(freq), freq = freq)
}
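# Note: as.matrix() densifies the sparse term-document matrix, which can
# exhaust memory on larger samples. A sketch of an equivalent helper using
# sparse row sums from the slam package (a tm dependency), for comparison:
topFreqSparse <- function(tdm) {
  freq <- sort(slam::row_sums(tdm), decreasing = TRUE)
  data.frame(word = names(freq), freq = freq)
}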
# Build n-gram frequency tables - most common unigrams, bigrams and trigrams - for plotting
# Unigram frequency
unigram_freq <- TermDocumentMatrix(corpus1, control = list(tokenize = unigramToken))
unigram_freq2 <- removeSparseTerms(unigram_freq, 0.99)
uniCorpus_freq3 <- topFreq(unigram_freq2)
# Bigram frequency
bigram_freq <- TermDocumentMatrix(corpus1, control = list(tokenize = bigramToken))
bigram_freq2 <- removeSparseTerms(bigram_freq, 0.999)
biCorpus_freq3 <- topFreq(bigram_freq2)
# Trigram frequency
trigram_freq <- TermDocumentMatrix(corpus1, control = list(tokenize = trigramToken))
trigram_freq2 <- removeSparseTerms(trigram_freq, 0.9999)
triCorpus_freq3 <- topFreq(trigram_freq2)
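# Peek at the top of each frequency table before plotting
head(uniCorpus_freq3, 3); head(biCorpus_freq3, 3); head(triCorpus_freq3, 3)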
# plot top 20 unigrams
n_gram_chart <- ggplot(uniCorpus_freq3[1:20, ], aes(x = reorder(word, freq), y = freq)) +
  geom_bar(stat = "identity", fill = "coral2") +
  coord_flip() +
  theme(legend.title = element_blank()) +
  labs(title = "Top 20 Unigrams (1 word)") +
  xlab("Word") +
  ylab("Unigram Frequency")
print(n_gram_chart)
# plot top 20 bigrams
bi_gram_chart <- ggplot(biCorpus_freq3[1:20, ], aes(x = reorder(word, freq), y = freq)) +
  geom_bar(stat = "identity", fill = "springgreen4") +
  coord_flip() +
  theme(legend.title = element_blank()) +
  labs(title = "Top 20 Bigrams (2 word phrases)") +
  xlab("Phrases") +
  ylab("Bigram Frequency")
print(bi_gram_chart)
# plot top 20 trigrams
tri_gram_chart <- ggplot(triCorpus_freq3[1:20, ], aes(x = reorder(word, freq), y = freq)) +
  geom_bar(stat = "identity", fill = "dodgerblue3") +
  coord_flip() +
  theme(legend.title = element_blank()) +
  labs(title = "Top 20 Trigrams (3 word phrases)") +
  xlab("Phrases") +
  ylab("Trigram Frequency")
print(tri_gram_chart)