Milestone Report

Load the required libraries

I will use the quanteda package because it is more efficient, can work with multithreading and all matrices are sparse by default.

library(tm)
library(readtext)
library(spacyr)
library(stopwords)
library(quanteda)
library(ggplot2)
library(quanteda.textplots)
library(quanteda.textstats)
library(quanteda.dictionaries)
# Set quanteda's `threads` option to 10.
quanteda_options(threads = 10)

Load and examine the size of the data

First, we inspect the size of the three files in MB (bytes / 1024^2):

# On-disk size of each raw text file, converted from bytes to megabytes
# (bytes / 1024^2) and rounded to one decimal place.
blogs_size <- round(file.size("en_US.blogs.txt") / 1024^2, 1)
news_size <- round(file.size("en_US.news.txt") / 1024^2, 1)
twitter_size <- round(file.size("en_US.twitter.txt") / 1024^2, 1)

# Named vector so the printed output is labelled by source.
file_sizes <- c(
  Blogs = blogs_size,
  News = news_size,
  Twitter = twitter_size
)
file_sizes

##   Blogs    News Twitter 
##   200.4   196.3   159.4

Perform basic summaries

Line counts:

# Read every line of the three raw files into character vectors.
# A file path is passed directly (instead of an already-opened `file()`
# connection) so that readLines() opens the connection itself and closes it
# again when it is done; the previous form left three connections open.
# `encoding = "UTF-8"` marks the strings as UTF-8 and `skipNul = TRUE` drops
# embedded NUL bytes instead of truncating the line.
blogs <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
news <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)

# Number of lines per source, labelled for printing.
line_counts <- c(
  Blogs = length(blogs),
  News = length(news),
  Twitter = length(twitter)
)
line_counts

##   Blogs    News Twitter 
##  899288 1010242 2360148

Character counts:

# Total number of characters in each source (sum of per-line lengths).
char_counts <- c(
  Blogs = sum(nchar(blogs)),
  News = sum(nchar(news)),
  Twitter = sum(nchar(twitter))
)
char_counts

##     Blogs      News   Twitter 
## 206824505 203223159 162096241

Average characters per line:

# Mean line length in characters for each source, rounded to a whole number.
char_average <- c(
  Blogs = round(mean(nchar(blogs)), 0),
  News = round(mean(nchar(news)), 0),
  Twitter = round(mean(nchar(twitter)), 0)
)
char_average

##   Blogs    News Twitter 
##     230     201      69

Word counts:

# Per-line word counts: each line is split on a single space and the number
# of resulting pieces is taken as that line's word count.
word_blogs <- lengths(strsplit(blogs, " "))
word_news <- lengths(strsplit(news, " "))
word_twitter <- lengths(strsplit(twitter, " "))

# Total words per source.
word_counts <- c(
  Blogs = sum(word_blogs),
  News = sum(word_news),
  Twitter = sum(word_twitter)
)
word_counts

##    Blogs     News  Twitter 
## 37334131 34372530 30373583

Average words per line:

# Mean number of words per line for each source, rounded to a whole number.
word_average <- c(
  Blogs = round(mean(word_blogs), 0),
  News = round(mean(word_news), 0),
  Twitter = round(mean(word_twitter), 0)
)
word_average

##   Blogs    News Twitter 
##      42      34      13

Sample the data

In order to tokenize and analyze the data further, we sample 8000 lines from each set (less than 1% of each file: roughly 0.9% of the blogs, 0.8% of the news and 0.3% of the tweets). The summary of that reduced corpus is shown below (Types is the number of unique tokens):

# Draw a reproducible random sample of 8000 lines from each source.
# The seed is reset before every draw so each sample can be regenerated
# independently of the other two.
set.seed(3333)
blogs_sample <- sample(blogs, size = 8000, replace = FALSE)
set.seed(3333)
news_sample <- sample(news, size = 8000, replace = FALSE)
set.seed(3333)
twitter_sample <- sample(twitter, size = 8000, replace = FALSE)

# Persist one plain-text file per source.
# writeLines() stores only the sampled lines; write.table() would also write
# a header row, row numbers and surrounding quotes, all of which would end up
# in the corpus and distort the token/type counts.
dir.create("Sampling", showWarnings = FALSE)
writeLines(blogs_sample, file.path("Sampling", "blogs_sample.txt"))
writeLines(news_sample, file.path("Sampling", "news_sample.txt"))
writeLines(twitter_sample, file.path("Sampling", "twitter_sample.txt"))

# Generate a corpus with the sampled data (one document per file).
# `pattern` is a regular expression, so match the ".txt" suffix explicitly.
tm_corpus <- VCorpus(DirSource(directory = "Sampling", pattern = "\\.txt$"))
q_corpus <- corpus(tm_corpus)

# Label the documents. NOTE(review): this assumes the files are read in
# alphabetical order (blogs, news, twitter) -- confirm if files are renamed.
docvars(q_corpus, field = "TextId") <- c("Blogs", "News", "Twitter")
summary(q_corpus)[, c("TextId", "Types", "Tokens", "Sentences", "language")]

##    TextId Types Tokens Sentences language
## 1   Blogs 41346 416396     18258       en
## 2    News 41021 361075     14824       en
## 3 Twitter 25204 165669      8918       en

Exploratory Analysis

Perform an exploratory analysis to understand the words and frequency we have. First, we generate a list of the top terms by removing punctuation, numbers and stop words. Stop words are articles, conjunctions, prepositions, pronouns, and common verbs that do not have much meaning.

# Tokenize the sampled corpus, dropping punctuation, symbols, numbers, URLs
# and separators. (The stray trailing comma that passed an empty argument to
# tokens() has been removed.)
topTerms <- tokens(
  q_corpus,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
)
# Lower-case first so that the stop-word list matches regardless of case.
topTerms <- tokens_tolower(topTerms)
topTerms <- tokens_remove(topTerms, pattern = stopwords("english"))

# Document-feature matrix and its 20 most frequent features.
topTerms_DFM <- dfm(topTerms)
# topfeatures() returns a named numeric vector; data.frame() turns the names
# into row names, which are then copied into the `Words` column.
topTerms_Top20 <- data.frame(Frequency = topfeatures(topTerms_DFM, n = 20))
# Order the factor levels by frequency so the bars are drawn sorted.
topTerms_Top20$Words <- reorder(
  factor(rownames(topTerms_Top20)),
  topTerms_Top20$Frequency
)

topTerms_Top20Plot <- ggplot(
  data = topTerms_Top20,
  aes(x = Frequency, y = Words)
) +
  geom_col() +
  ggtitle("Top 20 most frequent words")
topTerms_Top20Plot

We can also visualize the top terms with a word cloud:

# Word cloud of the features that occur at least 100 times, drawn with a
# 20-colour topo palette.
textplot_wordcloud(
  topTerms_DFM,
  color = topo.colors(20),
  min_count = 100
)

Generate the N-Grams

In order to prepare for the prediction of the next word, we have to generate N-Grams, which are sequences of words that appear together in our texts. The stop words are not removed since they are essential for predicting the next word.

The two-grams and the 20 most frequently found:

# Tokenize the sampled corpus (stop words are kept on purpose), lower-case
# the tokens and combine them into space-separated 2-grams. (The stray
# trailing comma that passed an empty argument to tokens() has been removed.)
twoGram <- tokens(
  q_corpus,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
)
twoGram <- tokens_tolower(twoGram)
twoGram <- tokens_ngrams(twoGram, n = 2, concatenator = " ")

# Document-feature matrix and its 20 most frequent 2-grams.
twoGram_DFM <- dfm(twoGram)
# topfeatures() returns a named numeric vector; data.frame() turns the names
# into row names, which are then copied into the `Words` column.
twoGram_Top20 <- data.frame(Frequency = topfeatures(twoGram_DFM, n = 20))
# Order the factor levels by frequency so the bars are drawn sorted.
twoGram_Top20$Words <- reorder(
  factor(rownames(twoGram_Top20)),
  twoGram_Top20$Frequency
)

twoGram_Top20Plot <- ggplot(
  data = twoGram_Top20,
  aes(x = Frequency, y = Words)
) +
  geom_col() +
  ggtitle("Top 20 most frequent 2-Grams")
twoGram_Top20Plot

The three-grams and the 20 most frequently found:

# Tokenize the sampled corpus (stop words are kept on purpose), lower-case
# the tokens and combine them into space-separated 3-grams. (The stray
# trailing comma that passed an empty argument to tokens() has been removed.)
threeGram <- tokens(
  q_corpus,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
)
threeGram <- tokens_tolower(threeGram)
threeGram <- tokens_ngrams(threeGram, n = 3, concatenator = " ")

# Document-feature matrix and its 20 most frequent 3-grams.
threeGram_DFM <- dfm(threeGram)
# topfeatures() returns a named numeric vector; data.frame() turns the names
# into row names, which are then copied into the `Words` column.
threeGram_Top20 <- data.frame(Frequency = topfeatures(threeGram_DFM, n = 20))
# Order the factor levels by frequency so the bars are drawn sorted.
threeGram_Top20$Words <- reorder(
  factor(rownames(threeGram_Top20)),
  threeGram_Top20$Frequency
)

threeGram_Top20Plot <- ggplot(
  data = threeGram_Top20,
  aes(x = Frequency, y = Words)
) +
  geom_col() +
  ggtitle("Top 20 most frequent 3-Grams")
threeGram_Top20Plot

The four-grams and the 20 most frequently found:

# Tokenize the sampled corpus (stop words are kept on purpose), lower-case
# the tokens and combine them into space-separated 4-grams. (The stray
# trailing comma that passed an empty argument to tokens() has been removed.)
fourGram <- tokens(
  q_corpus,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
)
fourGram <- tokens_tolower(fourGram)
fourGram <- tokens_ngrams(fourGram, n = 4, concatenator = " ")

# Document-feature matrix and its 20 most frequent 4-grams.
fourGram_DFM <- dfm(fourGram)
# topfeatures() returns a named numeric vector; data.frame() turns the names
# into row names, which are then copied into the `Words` column.
fourGram_Top20 <- data.frame(Frequency = topfeatures(fourGram_DFM, n = 20))
# Order the factor levels by frequency so the bars are drawn sorted.
fourGram_Top20$Words <- reorder(
  factor(rownames(fourGram_Top20)),
  fourGram_Top20$Frequency
)

fourGram_Top20Plot <- ggplot(
  data = fourGram_Top20,
  aes(x = Frequency, y = Words)
) +
  geom_col() +
  ggtitle("Top 20 most frequent 4-Grams")
fourGram_Top20Plot

Conclusions and next steps:

The data provided is too big to analyze in full on a normal computer. After examining the type and size of the data, I randomly sampled 8000 lines from each file (less than 1% of the data) for further analysis. The sampled data was tokenized and analyzed to get the most frequent words. Afterwards, lists of n-grams were generated for later use in the prediction algorithm. For visualization, bar charts of the 20 most frequent words, 2-grams, 3-grams and 4-grams were shown. The next step is to build the prediction algorithm and the Shiny app.