1 Loading the Data

We load the libraries required for the analysis…

library(dplyr); library(ggplot2); library(ggsci); library(magrittr); library(plotly); library(stringr); library(tidytext)

… and define new functions: one that reads a text file into R and re-encodes it appropriately (text.to.vector), and one that converts the resulting character vector into a data frame with line numbers and per-line word counts (tidy.corpus).

text.to.vector <- function(path) {
    # read a corpus into R from a text file and transliterate it to latin1
    
    require(magrittr)
    
    connection <- file(path)
    on.exit(close(connection))                                       # close the connection even if reading fails
    
    connection %>%
        readLines(encoding = "UTF-8") %>%
        iconv(from = "UTF-8", to = "latin1//TRANSLIT", sub = "")     # change encoding, dropping characters that cannot be transliterated
}


tidy.corpus <- function(corpus_text) {
    # convert a corpus (character vector) into a tidy data frame
    # with line numbers, a corpus label, and per-line word counts
    
    require(dplyr)
    require(tidytext)
    
    label <- deparse(substitute(corpus_text))               # label rows in the final data frame with the object's name
    wordcount <- stringr::str_count(corpus_text, "\\s+")    # approximate words per row by counting whitespace runs
    
    corpus_text %>%
        dplyr::tibble(text = .) %>% 
        mutate(linenumber = row_number()) %>%           # remember row numbers (e.g. chapters)
        mutate(corpus = label) %>%                      # remember corpus name
        mutate(wordcount = wordcount)                   # remember word count

}
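
To make the output structure concrete, here is a minimal illustration on a toy character vector (the toy object is invented for this example and is not part of the corpora):

toy <- c("the quick brown fox", "jumps over the lazy dog")
tidy.corpus(toy)
## a tibble with columns text, linenumber (1, 2), corpus ("toy"),
## and wordcount (3 and 4 here, i.e. whitespace runs, one fewer than the actual word count)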

With these functions, we load the data into R and convert it into a ‘tidy’ format compatible with tidyverse packages.

en_twitter <- text.to.vector("D:\\RStudio\\10-Capstone\\final\\en_US\\en_US.twitter.txt")
en_blogs <- text.to.vector("D:\\RStudio\\10-Capstone\\final\\en_US\\en_US.blogs.txt")
en_news <- text.to.vector("D:\\RStudio\\10-Capstone\\final\\en_US\\en_US.news.txt")

en_blogs_tidy <- tidy.corpus(en_blogs)
en_news_tidy <- tidy.corpus(en_news)
en_twitter_tidy <- tidy.corpus(en_twitter)

en_tidy <- dplyr::bind_rows(en_blogs_tidy,
                            en_news_tidy,
                            en_twitter_tidy)

2 Summarising the corpora

A basic summary shows that the Twitter corpus has by far the most entries. As expected, tweets are very short because Twitter limits their length (a mean of about 12 words and a median of 11). The blog corpus contains the longest entries and the most words in total, although the length of a typical blog entry is similar to that of a typical news entry.

options(dplyr.width = Inf)

en_tidy %>% 
    group_by(corpus) %>% 
    summarise(lines = max(linenumber), 
              total_words = sum(wordcount), 
              mean_words = mean(wordcount), 
              min_words = min(wordcount), 
              quantile_1st_words = quantile(wordcount, probs = 0.25), 
              median = quantile(wordcount, probs = 0.5),
              quantile_3rd_words = quantile(wordcount, probs = 0.75),
              max_words = max(wordcount))
## # A tibble: 3 x 9
##   corpus       lines total_words mean_words min_words quantile_1st_words
##   <chr>        <dbl>       <int>      <dbl>     <dbl>              <dbl>
## 1 en_blogs    899288    36434843       40.5         0                  8
## 2 en_news      77259     2566710       33.2         0                 18
## 3 en_twitter 2360148    28011566       11.9         0                  6
##   median quantile_3rd_words max_words
##    <dbl>              <dbl>     <dbl>
## 1     27                 58      6629
## 2     30                 44      1030
## 3     11                 17        46

We use a violin plot to show the word count per row for each corpus. The black lines correspond to the first quartile, the median, and the third quartile. Note that the y-axis is on a log scale. The blog corpus contains the most extreme outliers but is otherwise very similar to the news corpus.

en_tidy %>% 
    ggplot(aes(x = factor(corpus), y = wordcount, fill = corpus)) +
    geom_violin(scale = "width", show.legend = FALSE, draw_quantiles = c(0.25, 0.5, 0.75)) +
    scale_y_log10() +
    annotation_logticks(base = 10, sides = "l", scaled = TRUE) +
    ggsci::scale_fill_npg() +
    labs(x = "Corpus", y ="Wordcount") +
    theme_classic()

A word-count histogram confirms these observations.

en_tidy %>% 
{ggplot2::ggplot(., aes(x = wordcount, fill = corpus)) +
        geom_histogram(position = "identity", alpha = 0.5, binwidth = 20) +
        theme_classic() +
        scale_y_log10(limits = c(NA, 10^6.2)) +
        xlim(NA, 1200) +
        xlab("Words per entry") +
        ylab("Count") +
        ggsci::scale_fill_npg()} %>%
    plotly::ggplotly(height = 600, width = 972, tooltip = "count")

3 Themes in corpora

Finally, we explore common themes in each corpus, i.e. we look at the most common words, excluding ‘stop words’ (words that occur very frequently in a language, e.g. articles, pronouns, and prepositions). To do so, we define a function that tokenises the corpora into words (using the tidytext package), removes stop words, sorts the remaining words by how often they occur in each corpus, and plots only the most common ones.

plot.common.words <- function(x, number) {
    # plot the most commonly used words in the corpora (without stop words)
    # by plotting each corpus in its own facet
    
    require(dplyr)
    require(ggplot2)
    require(ggsci)
    require(magrittr)
    require(plotly)
    require(tidytext)

    x %>% 
        tidytext::unnest_tokens(word, text) %>%            # tokenize
        dplyr::anti_join(stop_words) %>%
        dplyr::group_by(corpus) %T>%
        {.$corpus %>%             # build a corpus -> index lookup, needed to differentiate between the same words in different corpora
                unique() %>% 
                dplyr::tibble(corpus = .) %>% 
                dplyr::mutate(no = dplyr::row_number()) %>% 
                t() %>% 
                magrittr::set_colnames(.[1,]) %>% 
                .[2,] ->> corpus_list} %>%
        dplyr::count(word, sort = TRUE) %>%
        dplyr::top_n(number) %>%
        dplyr::slice(1:number) %>%
        dplyr::ungroup() %>%
        dplyr::mutate(whitespaces = stringr::str_dup(" ", times = as.integer(corpus_list[corpus]))) %>%   # corpus-specific number of leading spaces
        dplyr::mutate(word = paste0(whitespaces, word)) %>%     # pad words so identical words in different corpora become distinct factor levels (keeps per-facet ordering)
        dplyr::mutate(word = reorder(word, n)) %>% 
        {ggplot2::ggplot(., aes(x = word, fill = corpus, text = paste0(n, ""))) +
                geom_col(aes(y = n), position = "identity") +
                xlab(NULL) +
                coord_flip() +
                ggsci::scale_fill_npg() +
                theme_classic() +
                theme(legend.position = "none") +
                facet_wrap(~ corpus, ncol = 3, scales = "free")} %>% 
        plotly::ggplotly(height = 600, width = 972, tooltip = "text")
}

plot.common.words(en_tidy, number = 20)

While each corpus contains some corpus-specific common words (e.g. rt, twitter), many common words recur in all three corpora (e.g. time, people, day, 2, 3). The analysis therefore suggests that the corpora contain similar text and can largely be used interchangeably in future model building.

4 Plans for creating the prediction algorithm

The prediction algorithm will be built mostly with the R packages ‘tm’ and ‘quanteda’. The preliminary analysis showed that all three corpora centre on similar topics, so I expect that they can, for the most part, be used interchangeably. In the next step, I will perform bigram tokenisation. To speed up the analysis, I will discard all bigrams that occur only once; according to Zipf’s law there should be many of them, so this filter will remove a large portion of the data that cannot be used for predictive typing.
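
Since the final n-gram model will be built with tm or quanteda, the snippet below is only a rough sketch of the planned bigram step, written with tidytext and dplyr (already loaded above); the bigram column name and the 1% sample are arbitrary choices made for this illustration.

en_tidy %>%
    dplyr::sample_frac(0.01) %>%                                        # small random sample so the sketch runs quickly
    tidytext::unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%  # bigram tokenisation
    dplyr::filter(!is.na(bigram)) %>%                                   # rows shorter than two words yield NA bigrams
    dplyr::count(bigram, sort = TRUE) %>%
    dplyr::filter(n > 1)                                                # keep only bigrams that occur more than once

On the full corpora, the same pipeline without the sampling step should discard the long tail of single-occurrence bigrams that Zipf’s law predicts, substantially shrinking the frequency table used for prediction.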