The aim of this report is to explain our exploratory analysis and our goals for the eventual app and algorithm.

Load relevant packages

library(stringi, quietly = TRUE)
library(knitr, quietly = TRUE)
library(tm, quietly = TRUE)
library(SnowballC, quietly = TRUE)
library(RColorBrewer, quietly = TRUE)
library(wordcloud, quietly = TRUE)
library(ggplot2, quietly = TRUE)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(ggpubr, quietly = TRUE)
library(dplyr, quietly = TRUE)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(forcats, quietly = TRUE)
library(RWeka, quietly = TRUE)

Basic summary of the 3 datasets

Read the files from the working directory and summarize file size, word counts, line counts properties

path1 = 'final/en_US/en_US.blogs.txt'
path2 = 'final/en_US/en_US.twitter.txt'
path3 = 'final/en_US/en_US.news.txt'

con = file(path1, open = 'rb')
blogs = readLines(con, encoding = 'UTF-8')
close(con)

con = file(path2, open = 'rb')
twitter = readLines(con, encoding = 'UTF-8')
close(con)

con = file(path3, open = 'rb')
news = readLines(con, encoding = 'UTF-8')
close(con)

table_stats <- data.frame(
   fileName = c("en_US.blogs",
                "en_US.twitter",
                "en_US.news"),
   fileSize = c(file.info(path1)$size/1024^2,
                file.info(path2)$size/1024^2,
                file.info(path3)$size/1024^2),
   t(rbind(sapply(list(blogs, twitter, news), stri_stats_general),
           WordCount = sapply(list(blogs, twitter, news), stri_stats_latex)[4,]))
)
kable(table_stats)
fileName fileSize Lines LinesNEmpty Chars CharsNWhite WordCount
en_US.blogs 200.4242 899288 899288 206824382 170389539 37570839
en_US.twitter 159.3641 2360148 2360148 162096031 134082634 30451128
en_US.news 196.2775 1010242 1010242 203223154 169860866 34494539

Data cleaning

Due to the huge volume of data, the following steps are applied to a sample of the data (2000 lines of each file)

set.seed(5296)

smpl_data = c(blogs[sample(1:length(blogs), 2000, replace=FALSE)],
              twitter[sample(1:length(twitter), 2000, replace=FALSE)],
              news[sample(1:length(news), 2000, replace=FALSE)]
)

rm(blogs); rm(twitter); rm(news)


build_corpus <- function (x = sampleData) {
   sample_c <- VCorpus(VectorSource(x)) # Create corpus dataset
   sample_c <- tm_map(sample_c, tolower) # all lowercase
   sample_c <- tm_map(sample_c, removePunctuation) # Eliminate punctuation
   sample_c <- tm_map(sample_c, removeNumbers) # Eliminate numbers
   sample_c <- tm_map(sample_c, stripWhitespace) # Strip Whitespace
   sample_c <- tm_map(sample_c, removeWords, stopwords("english")) # Eliminate English stop words
   sample_c <- tm_map(sample_c, stemDocument) # Stem the document
   sample_c <- tm_map(sample_c, PlainTextDocument) # Create plain text format
}

corpus = build_corpus(smpl_data)
dtm <- DocumentTermMatrix(corpus)

rm(smpl_data)

dtm
## <<DocumentTermMatrix (documents: 6000, terms: 15326)>>
## Non-/sparse entries: 85567/91870433
## Sparsity           : 100%
## Maximal term length: 47
## Weighting          : term frequency (tf)

In the sample, there are 15,326 terms. We will now summarize these terms by producing:

Distribution of word frequencies

Barplots

dtm_m <- as.matrix(dtm)
fr_tb <- cbind(
   colnames(dtm_m),
   colSums(dtm_m)
) %>% as.data.frame()
rownames(fr_tb) <- NULL
colnames(fr_tb) <- c("word", "freq")
fr_tb$freq <- as.numeric(fr_tb$freq)
fr_tb <- fr_tb[order(fr_tb$freq, decreasing = T),]
fr_tb$word <- fct_reorder(fr_tb$word, fr_tb$freq, .desc = TRUE)



p = ggplot(data = fr_tb[1:10,]) + geom_col(aes(x = freq, y = word)) 
p

Word clouds

wordcloud::wordcloud(
   word = fr_tb$word, 
   freq = fr_tb$freq,
   colors=brewer.pal(8, 'Paired'),
   min.freq = 1,
   max.words=100, 
   random.order=FALSE,
   use.r.layout = TRUE)

Frequencies of bigrams and trigrams

Tokenize and Calculate Frequencies of N-Grams

getTermTable <- function(corpusData, ngrams = 1, lowfreq = 50) {
   #create term-document matrix tokenized on n-grams
   tokenizer <- function(x) {NGramTokenizer(x, Weka_control(min = ngrams, max = ngrams)) }
   tdm <- TermDocumentMatrix(corpusData, control = list(tokenize = tokenizer))
   #find the top term grams with a minimum of occurrence in the corpus
   top_terms <- findFreqTerms(tdm,lowfreq)
   top_terms_freq <- rowSums(as.matrix(tdm[top_terms,]))
   top_terms_freq <- data.frame(word = names(top_terms_freq), frequency = top_terms_freq)
   top_terms_freq$word <- fct_reorder(top_terms_freq$word, top_terms_freq$frequency, .desc = TRUE)
   top_terms_freq <- arrange(top_terms_freq, desc(frequency))
   
}

nGram.Data <- list(2)
for (i in 2:3) {
   nGram.Data[[i-1]] <- getTermTable(corpus, ngrams = i, lowfreq = 5)
}

Barplot of N-grams frequencies

p1 = ggplot(data = nGram.Data[[1]][1:10,]) + geom_col(aes(x = frequency, y = word)) +
   labs(title =  'Bigrams')
p2 = ggplot(data = nGram.Data[[2]]) + geom_col(aes(x = frequency, y = word)) +
   labs(title =  'Trigrams')
ggarrange(p1, p2, ncol = 1)

Word clouds of N-grams

Bigrams

wordcloud::wordcloud(
   word = nGram.Data[[1]]$word,
   freq = nGram.Data[[1]]$frequency,
   colors=brewer.pal(8, 'Paired'),
   min.freq = 1,
   max.words=100,
   random.order=FALSE,
   use.r.layout = TRUE)

#### Trigrams

wordcloud::wordcloud(
   word = nGram.Data[[2]]$word,
   freq = nGram.Data[[2]]$frequency,
   colors=brewer.pal(8, 'Paired'),
   min.freq = 1,
   max.words=100,
   random.order=FALSE,
   use.r.layout = TRUE)

Next steps

The next steps will be to build a predictive algorithm that uses an n-gram model. This algorithm will then be deployed in a Shiny app and will suggest the most likely next word after a phrase is typed.