The aim of this report is to explain our exploratory analysis and our goals for the eventual app and algorithm.
library(stringi, quietly = TRUE)
library(knitr, quietly = TRUE)
library(tm, quietly = TRUE)
library(SnowballC, quietly = TRUE)
library(RColorBrewer, quietly = TRUE)
library(wordcloud, quietly = TRUE)
library(ggplot2, quietly = TRUE)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(ggpubr, quietly = TRUE)
library(dplyr, quietly = TRUE)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(forcats, quietly = TRUE)
library(RWeka, quietly = TRUE)
path1 = 'final/en_US/en_US.blogs.txt'
path2 = 'final/en_US/en_US.twitter.txt'
path3 = 'final/en_US/en_US.news.txt'
con = file(path1, open = 'rb')
blogs = readLines(con, encoding = 'UTF-8')
close(con)
con = file(path2, open = 'rb')
twitter = readLines(con, encoding = 'UTF-8')
close(con)
con = file(path3, open = 'rb')
news = readLines(con, encoding = 'UTF-8')
close(con)
table_stats <- data.frame(
fileName = c("en_US.blogs",
"en_US.twitter",
"en_US.news"),
fileSize = c(file.info(path1)$size/1024^2,
file.info(path2)$size/1024^2,
file.info(path3)$size/1024^2),
t(rbind(sapply(list(blogs, twitter, news), stri_stats_general),
WordCount = sapply(list(blogs, twitter, news), stri_stats_latex)[4,]))
)
kable(table_stats)
| fileName | fileSize | Lines | LinesNEmpty | Chars | CharsNWhite | WordCount |
|---|---|---|---|---|---|---|
| en_US.blogs | 200.4242 | 899288 | 899288 | 206824382 | 170389539 | 37570839 |
| en_US.twitter | 159.3641 | 2360148 | 2360148 | 162096031 | 134082634 | 30451128 |
| en_US.news | 196.2775 | 1010242 | 1010242 | 203223154 | 169860866 | 34494539 |
Due to the huge volume of data, the following steps are applied to a sample of the data (2000 lines of each file)
set.seed(5296)
smpl_data = c(blogs[sample(1:length(blogs), 2000, replace=FALSE)],
twitter[sample(1:length(twitter), 2000, replace=FALSE)],
news[sample(1:length(news), 2000, replace=FALSE)]
)
rm(blogs); rm(twitter); rm(news)
build_corpus <- function (x = sampleData) {
sample_c <- VCorpus(VectorSource(x)) # Create corpus dataset
sample_c <- tm_map(sample_c, tolower) # all lowercase
sample_c <- tm_map(sample_c, removePunctuation) # Eliminate punctuation
sample_c <- tm_map(sample_c, removeNumbers) # Eliminate numbers
sample_c <- tm_map(sample_c, stripWhitespace) # Strip Whitespace
sample_c <- tm_map(sample_c, removeWords, stopwords("english")) # Eliminate English stop words
sample_c <- tm_map(sample_c, stemDocument) # Stem the document
sample_c <- tm_map(sample_c, PlainTextDocument) # Create plain text format
}
corpus = build_corpus(smpl_data)
dtm <- DocumentTermMatrix(corpus)
rm(smpl_data)
dtm
## <<DocumentTermMatrix (documents: 6000, terms: 15326)>>
## Non-/sparse entries: 85567/91870433
## Sparsity : 100%
## Maximal term length: 47
## Weighting : term frequency (tf)
In the sample, there are 15,326 terms. We will now summarize these terms by producing:
dtm_m <- as.matrix(dtm)
fr_tb <- cbind(
colnames(dtm_m),
colSums(dtm_m)
) %>% as.data.frame()
rownames(fr_tb) <- NULL
colnames(fr_tb) <- c("word", "freq")
fr_tb$freq <- as.numeric(fr_tb$freq)
fr_tb <- fr_tb[order(fr_tb$freq, decreasing = T),]
fr_tb$word <- fct_reorder(fr_tb$word, fr_tb$freq, .desc = TRUE)
p = ggplot(data = fr_tb[1:10,]) + geom_col(aes(x = freq, y = word))
p
wordcloud::wordcloud(
word = fr_tb$word,
freq = fr_tb$freq,
colors=brewer.pal(8, 'Paired'),
min.freq = 1,
max.words=100,
random.order=FALSE,
use.r.layout = TRUE)
Tokenize and Calculate Frequencies of N-Grams
getTermTable <- function(corpusData, ngrams = 1, lowfreq = 50) {
#create term-document matrix tokenized on n-grams
tokenizer <- function(x) {NGramTokenizer(x, Weka_control(min = ngrams, max = ngrams)) }
tdm <- TermDocumentMatrix(corpusData, control = list(tokenize = tokenizer))
#find the top term grams with a minimum of occurrence in the corpus
top_terms <- findFreqTerms(tdm,lowfreq)
top_terms_freq <- rowSums(as.matrix(tdm[top_terms,]))
top_terms_freq <- data.frame(word = names(top_terms_freq), frequency = top_terms_freq)
top_terms_freq$word <- fct_reorder(top_terms_freq$word, top_terms_freq$frequency, .desc = TRUE)
top_terms_freq <- arrange(top_terms_freq, desc(frequency))
}
nGram.Data <- list(2)
for (i in 2:3) {
nGram.Data[[i-1]] <- getTermTable(corpus, ngrams = i, lowfreq = 5)
}
p1 = ggplot(data = nGram.Data[[1]][1:10,]) + geom_col(aes(x = frequency, y = word)) +
labs(title = 'Bigrams')
p2 = ggplot(data = nGram.Data[[2]]) + geom_col(aes(x = frequency, y = word)) +
labs(title = 'Trigrams')
ggarrange(p1, p2, ncol = 1)
wordcloud::wordcloud(
word = nGram.Data[[1]]$word,
freq = nGram.Data[[1]]$frequency,
colors=brewer.pal(8, 'Paired'),
min.freq = 1,
max.words=100,
random.order=FALSE,
use.r.layout = TRUE)
#### Trigrams
wordcloud::wordcloud(
word = nGram.Data[[2]]$word,
freq = nGram.Data[[2]]$frequency,
colors=brewer.pal(8, 'Paired'),
min.freq = 1,
max.words=100,
random.order=FALSE,
use.r.layout = TRUE)
The next steps will be to build a predictive algorithm that uses an n-gram model. This algorithm will then be deployed in a Shiny app and will suggest the most likely next word after a phrase is typed.