Data Science Capstone - Milestone report

The aim of this report is to explain our exploratory analysis and our goals for the eventual app and algorithm.

Load relevant packages

library(stringi, quietly = TRUE)
library(knitr, quietly = TRUE)
library(tm, quietly = TRUE)
library(SnowballC, quietly = TRUE)
library(RColorBrewer, quietly = TRUE)
library(wordcloud, quietly = TRUE)
library(ggplot2, quietly = TRUE)

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

library(ggpubr, quietly = TRUE)
library(dplyr, quietly = TRUE)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(forcats, quietly = TRUE)
library(RWeka, quietly = TRUE)

Basic summary of the 3 datasets

Read the three files from the working directory and summarize each file's size (in MB), line count, character count and word count.

# Locations of the three English-language source files, relative to the
# working directory.
path1 <- 'final/en_US/en_US.blogs.txt'
path2 <- 'final/en_US/en_US.twitter.txt'
path3 <- 'final/en_US/en_US.news.txt'

# Read every line of the text file at `path` and return them as a character
# vector whose strings are marked as UTF-8.
# The file is opened through a binary-mode connection, exactly as before, and
# on.exit() guarantees the connection is closed even when readLines() fails.
read_utf8_lines <- function(path) {
   con <- file(path, open = 'rb')
   on.exit(close(con), add = TRUE)
   readLines(con, encoding = 'UTF-8')
}

blogs <- read_utf8_lines(path1)
twitter <- read_utf8_lines(path2)
news <- read_utf8_lines(path3)

# Summary table with one row per source file: file size plus the line,
# character and word counts computed by stringi.
texts <- list(blogs, twitter, news)

# One column per file, rows Lines / LinesNEmpty / Chars / CharsNWhite.
# vapply() pins the expected shape (four integer counts per file).
general_stats <- vapply(texts, stri_stats_general, FUN.VALUE = integer(4))

# Word count per file, picked by name rather than by position in the
# stri_stats_latex() result.
word_counts <- vapply(
   texts,
   function(txt) stri_stats_latex(txt)[["Words"]],
   FUN.VALUE = integer(1)
)

table_stats <- data.frame(
   fileName = c("en_US.blogs",
                "en_US.twitter",
                "en_US.news"),
   # bytes -> megabytes
   fileSize = file.info(c(path1, path2, path3))$size / 1024^2,
   t(rbind(general_stats, WordCount = word_counts))
)
kable(table_stats)

fileName	fileSize	Lines	LinesNEmpty	Chars	CharsNWhite	WordCount
en_US.blogs	200.4242	899288	899288	206824382	170389539	37570839
en_US.twitter	159.3641	2360148	2360148	162096031	134082634	30451128
en_US.news	196.2775	1010242	1010242	203223154	169860866	34494539

Data cleaning

Due to the huge volume of data, the following steps are applied to a sample of the data (2,000 lines from each file):

Convert all words to lowercase
Eliminate punctuation
Eliminate numbers
Strip whitespace
Remove stopwords
Stem words (remove the commoner morphological and inflexional endings)

# Fixed seed so that the same lines are drawn on every run
set.seed(5296)

# Draw 2000 distinct lines from each source, visiting the sources in the
# order blogs, twitter, news, and concatenate them into one character vector.
smpl_data <- unlist(lapply(
   list(blogs, twitter, news),
   function(lines) lines[sample.int(length(lines), 2000, replace = FALSE)]
))

# The full texts are no longer needed; free the memory they occupy
rm(blogs, twitter, news)


# Build a cleaned tm corpus from a character vector of text lines.
#
# x: character vector with one document per element.
#    NOTE(review): the default `sampleData` is not defined anywhere in this
#    file (the sample is called `smpl_data`), so the argument effectively has
#    to be supplied by the caller; the default is kept only so the signature
#    stays unchanged.
#
# Returns a VCorpus whose documents have been lower-cased, stripped of
# punctuation, numbers, redundant whitespace and English stop words, and
# stemmed.
build_corpus <- function (x = sampleData) {
   sample_c <- VCorpus(VectorSource(x)) # Create corpus dataset
   # tolower() is a plain string function: wrapping it in content_transformer()
   # keeps every element a text document (with its metadata and id), so no
   # PlainTextDocument re-wrap is needed at the end.
   sample_c <- tm_map(sample_c, content_transformer(tolower)) # all lowercase
   # NOTE(review): punctuation is stripped before stop words are removed, so
   # contractions (e.g. "don't" -> "dont") may no longer match the stop word
   # list -- confirm that this order is intended.
   sample_c <- tm_map(sample_c, removePunctuation) # Eliminate punctuation
   sample_c <- tm_map(sample_c, removeNumbers) # Eliminate numbers
   sample_c <- tm_map(sample_c, stripWhitespace) # Strip Whitespace
   sample_c <- tm_map(sample_c, removeWords, stopwords("english")) # Eliminate English stop words
   sample_c <- tm_map(sample_c, stemDocument) # Stem the document
   sample_c
}

# Clean the sampled lines, then drop the raw sample: only the corpus is
# needed from here on.
corpus <- build_corpus(smpl_data)
rm(smpl_data)

# Document-term matrix of the cleaned corpus (term frequency weighting)
dtm <- DocumentTermMatrix(corpus)

dtm

## <<DocumentTermMatrix (documents: 6000, terms: 15326)>>
## Non-/sparse entries: 85567/91870433
## Sparsity           : 100%
## Maximal term length: 47
## Weighting          : term frequency (tf)

In the sample, there are 15,326 terms. We will now summarize these terms by producing:

a bar plot of the 10 most frequent words
a word cloud visualizing the sample

Distribution of word frequencies

Barplots

# Dense copy of the document-term matrix (documents in rows, terms in columns)
dtm_m <- as.matrix(dtm)

# Total number of occurrences of each term over all sampled documents
term_totals <- colSums(dtm_m)

# Build the frequency table directly from the named totals. Going through
# cbind() would coerce the counts to character first; on R versions where
# strings become factors by default, as.numeric() would then return factor
# level codes instead of the real counts.
fr_tb <- data.frame(
   word = names(term_totals),
   freq = as.numeric(term_totals),
   row.names = NULL,
   stringsAsFactors = FALSE
)

# Most frequent terms first
fr_tb <- fr_tb[order(fr_tb$freq, decreasing = TRUE), ]

# Factor levels follow descending frequency so plots keep that ordering
fr_tb$word <- fct_reorder(fr_tb$word, fr_tb$freq, .desc = TRUE)



# Bar chart of the ten most frequent terms. head() returns at most ten rows,
# so a table with fewer than ten terms is not padded with all-NA rows the way
# fr_tb[1:10, ] would be.
p <- ggplot(data = head(fr_tb, 10)) + geom_col(aes(x = freq, y = word))
p

Word clouds

# Word cloud of up to 100 terms, most frequent placed first.
# The argument is spelled out as `words` (no reliance on partial argument
# matching) and the factor column is passed as plain character strings.
wordcloud::wordcloud(
   words = as.character(fr_tb$word),
   freq = fr_tb$freq,
   colors = brewer.pal(8, 'Paired'),
   min.freq = 1,
   max.words = 100,
   random.order = FALSE,
   use.r.layout = TRUE)

Frequencies of bigrams and trigrams

Tokenize and Calculate Frequencies of N-Grams

# Frequency table of the n-grams that occur at least `lowfreq` times.
#
# corpusData: tm corpus to tokenize.
# ngrams:     n-gram size; used as both the minimum and the maximum length
#             handed to the Weka tokenizer.
# lowfreq:    minimum total number of occurrences for an n-gram to be kept.
#
# Returns a data frame with columns `word` (factor whose levels follow
# descending frequency) and `frequency`, sorted by descending frequency.
# When no n-gram reaches `lowfreq`, an empty data frame with the same two
# columns is returned.
getTermTable <- function(corpusData, ngrams = 1, lowfreq = 50) {
   #create term-document matrix tokenized on n-grams
   tokenizer <- function(x) {NGramTokenizer(x, Weka_control(min = ngrams, max = ngrams)) }
   tdm <- TermDocumentMatrix(corpusData, control = list(tokenize = tokenizer))
   #find the top term grams with a minimum of occurrence in the corpus
   top_terms <- findFreqTerms(tdm, lowfreq)
   if (length(top_terms) == 0) {
      # Nothing is frequent enough: without this guard the `word` column would
      # be dropped and fct_reorder() below would fail.
      return(data.frame(word = factor(character(0)), frequency = numeric(0)))
   }
   top_terms_freq <- rowSums(as.matrix(tdm[top_terms, ]))
   top_terms_freq <- data.frame(word = names(top_terms_freq), frequency = top_terms_freq)
   top_terms_freq$word <- fct_reorder(top_terms_freq$word, top_terms_freq$frequency, .desc = TRUE)
   # Last expression is the value itself, so the table is returned visibly
   arrange(top_terms_freq, desc(frequency))
}

# Frequency tables for bigrams (element 1) and trigrams (element 2), keeping
# n-grams that occur at least 5 times. lapply() builds the two-element list
# directly; list(2) would only create a one-element list holding the number 2.
nGram.Data <- lapply(2:3, function(n) {
   getTermTable(corpus, ngrams = n, lowfreq = 5)
})

Barplot of N-grams frequencies

# Ten most frequent bigrams; head() avoids the all-NA rows that [1:10, ]
# produces when fewer than ten bigrams are available.
p1 <- ggplot(data = head(nGram.Data[[1]], 10)) +
   geom_col(aes(x = frequency, y = word)) +
   labs(title = 'Bigrams')

# Every trigram in the table is shown (no top-10 cut is applied here)
p2 <- ggplot(data = nGram.Data[[2]]) +
   geom_col(aes(x = frequency, y = word)) +
   labs(title = 'Trigrams')

# Stack the two charts vertically
ggarrange(p1, p2, ncol = 1)

Word clouds of N-grams

Bigrams

# Word cloud of up to 100 bigrams, most frequent placed first.
# The argument is spelled out as `words` (no reliance on partial argument
# matching) and the factor column is passed as plain character strings.
wordcloud::wordcloud(
   words = as.character(nGram.Data[[1]]$word),
   freq = nGram.Data[[1]]$frequency,
   colors = brewer.pal(8, 'Paired'),
   min.freq = 1,
   max.words = 100,
   random.order = FALSE,
   use.r.layout = TRUE)

Trigrams

# Word cloud of up to 100 trigrams, most frequent placed first.
# The argument is spelled out as `words` (no reliance on partial argument
# matching) and the factor column is passed as plain character strings.
wordcloud::wordcloud(
   words = as.character(nGram.Data[[2]]$word),
   freq = nGram.Data[[2]]$frequency,
   colors = brewer.pal(8, 'Paired'),
   min.freq = 1,
   max.words = 100,
   random.order = FALSE,
   use.r.layout = TRUE)

Next steps

The next steps will be to build a predictive algorithm that uses an n-gram model. This algorithm will then be deployed in a Shiny app and will suggest the most likely next word after a phrase is typed.