Todd Rimes
6/13/2018
Load the three data sets (blogs, news, and Twitter), draw a small random sample from each, combine the samples, and finally calculate and plot the frequency distributions of individual words (unigrams), two-word combinations (bigrams), and three-word combinations (trigrams).
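To make the n-gram idea concrete, here is a toy illustration of the RWeka tokenizer this report relies on (run after loading RWeka below; the sentence is made up and not part of the analysis):

NGramTokenizer("the quick brown fox", Weka_control(min = 2, max = 2))
## [1] "the quick"   "quick brown" "brown fox"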
# load the required libraries
library(NLP) # for n-grams
library(tm) # for VCorpus
library(RWeka) # for NGramTokenizer
library(ggplot2) # for histograms

$ wc -l *.txt
  899288 en_US.blogs.txt
 1010242 en_US.news.txt
 2360148 en_US.twitter.txt
 4269678 total
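For reference, the same counts can be reproduced from within R; a minimal sketch, where countLines is a hypothetical helper and each call reads the whole file into memory:

# hypothetical helper: an in-R equivalent of wc -l
countLines <- function(path) length(readLines(path, skipNul = TRUE, warn = FALSE))
countLines("./final/en_US/en_US.blogs.txt") # ~899288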
# sample sizes (computed below when the samples are drawn)
blogSampleSize = 0
twitterSampleSize = 0
newsSampleSize = 0
# If the previously combined sample data exists on disk, load it from disk
if(file.exists("inMemory.rda")) {
## load the cached, cleaned sample corpus
load("inMemory.rda")
} else {
# The combined sample data does not exist, so create it now
blogText = readLines("./final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
twitterText = readLines("./final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
newsText = readLines("./final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
# limit the samples to 0.1% of the lines in each file
set.seed(1013) # fix the RNG seed so the samples are reproducible between runs
sample_pct = 0.001
blogSampleSize = round(length(blogText) * sample_pct, 0)
blogSample <- blogText[sample(length(blogText), blogSampleSize)]
twitterSampleSize = round(length(twitterText) * sample_pct, 0)
twitterSample <- twitterText[sample(length(twitterText), twitterSampleSize)]
newsSampleSize = round(length(newsText) * sample_pct, 0)
newsSample <- newsText[sample(length(newsText), newsSampleSize)]
# remove the large corpus files from memory
rm(blogText,twitterText,newsText)
# peek at the first few lines of each sample
head(blogSample)
head(twitterSample)
head(newsSample)
# Combine the three samples into a single character vector
allSample <- c(blogSample,twitterSample,newsSample)
# Build an in-memory tm corpus from the combined sample, then clean the text
inMemory <- VCorpus(VectorSource(allSample))
# lowercase
inMemory <- tm_map(inMemory, content_transformer(tolower))
# remove punctuation
inMemory <- tm_map(inMemory, removePunctuation)
# remove numbers
inMemory <- tm_map(inMemory, removeNumbers)
# collapse runs of whitespace to a single space
inMemory <- tm_map(inMemory, stripWhitespace)
# save to disk
save(inMemory, file="inMemory.rda")
}

## [1] "Blog sample count is 899"
## [1] "Twitter sample count is 2360"
## [1] "News sample count is 1010"
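The sample counts above were presumably printed by statements along these lines (reconstructed for clarity; the exact calls do not appear in the script):

print(paste("Blog sample count is", blogSampleSize))
print(paste("Twitter sample count is", twitterSampleSize))
print(paste("News sample count is", newsSampleSize))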
if(file.exists("oneWordMatrix.rda")) {
## load model
load("oneWordMatrix.rda")
} else {
oneWordTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
oneWordMatrix <- TermDocumentMatrix(inMemory, control = list(tokenize = oneWordTokenizer))
save(oneWordMatrix, file="oneWordMatrix.rda")
}
if(file.exists("topTerms1.rda")) {
## load model
load("topTerms1.rda")
load("wordCounts1.rda")
} else {
topTerms1 <- findFreqTerms(oneWordMatrix, lowfreq = 300)
save(topTerms1, file="topTerms1.rda")
wordCounts1 <- rowSums(as.matrix(oneWordMatrix[topTerms1,]))
wordCounts1 <- data.frame(unigram=names(wordCounts1), frequency=wordCounts1)
save(wordCounts1, file="wordCounts1.rda")
}
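Before plotting, a quick optional sanity check of the unigram counts (not part of the original script):

# the ten most frequent unigrams, highest first
head(wordCounts1[order(-wordCounts1$frequency), ], 10)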
g1 <- ggplot(wordCounts1, aes(x=reorder(unigram, frequency), y=frequency)) +
geom_bar(stat = "identity") + coord_flip() +
theme(legend.title=element_blank()) +
xlab("Unigram") + ylab("Frequency") +
labs(title = "Top unigrams by frequency")
print(g1)

# Bigrams: build (or load) the term-document matrix of two-word combinations
if(file.exists("twoWordMatrix.rda")) {
## load the cached matrix
load("twoWordMatrix.rda")
} else {
twoWordTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
twoWordMatrix <- TermDocumentMatrix(inMemory, control = list(tokenize = twoWordTokenizer))
save(twoWordMatrix, file="twoWordMatrix.rda")
}
if(file.exists("topTerms2.rda")) {
## load model
load("topTerms2.rda")
load("wordCounts2.rda")
} else {
topTerms2 <- findFreqTerms(twoWordMatrix, lowfreq = 125)
save(topTerms2, file="topTerms2.rda")
wordCounts2 <- rowSums(as.matrix(twoWordMatrix[topTerms2,]))
wordCounts2 <- data.frame(bigram=names(wordCounts2), frequency=wordCounts2)
save(wordCounts2, file="wordCounts2.rda")
}
g2 <- ggplot(wordCounts2, aes(x=reorder(bigram, frequency), y=frequency)) +
geom_bar(stat = "identity") + coord_flip() +
theme(legend.title=element_blank()) +
xlab("Bigram") + ylab("Frequency") +
labs(title = "Top bigrams by frequency")
print(g2)

# Trigrams: build (or load) the term-document matrix of three-word combinations
if(file.exists("threeWordMatrix.rda")) {
## load the cached matrix
load("threeWordMatrix.rda")
} else {
threeWordTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
threeWordMatrix <- TermDocumentMatrix(inMemory, control = list(tokenize = threeWordTokenizer))
save(threeWordMatrix, file="threeWordMatrix.rda")
}
if(file.exists("topTerms3.rda")) {
## load model
load("topTerms3.rda")
load("wordCounts3.rda")
} else {
topTerms3 <- findFreqTerms(threeWordMatrix, lowfreq = 20)
save(topTerms3, file="topTerms3.rda")
wordCounts3 <- rowSums(as.matrix(threeWordMatrix[topTerms3,]))
wordCounts3 <- data.frame(trigram=names(wordCounts3), frequency=wordCounts3)
save(wordCounts3, file="wordCounts3.rda")
}
g3 <- ggplot(wordCounts3, aes(x=reorder(trigram, frequency), y=frequency)) +
geom_bar(stat = "identity") + coord_flip() +
theme(legend.title=element_blank()) +
xlab("Trigram") + ylab("Frequency") +
labs(title = "Top trigrams by frequency")
print(g3)