Exploratory Analyses of English Datasets

Summary

The goal of this report is to investigate the distribution of, and the relationships between, the words, tokens and phrases in a set of files containing text from news, tweets and blogs. Basic relationships observed in the data are presented. These are the first steps to take in preparation for building a linguistic model.

Corpus

As a first step, we load the data from the files en_US.twitter.txt, en_US.blogs.txt and en_US.news.txt and draw a 20% sample.

# Read the three source files and cache the combined corpus on disk.
setCorpus()
# Draw the 20% sample (fixed seed inside getCorpusSample(), so it is reproducible).
Corpus <- getCorpusSample()
# Show the first rows of the sample as a captioned table.
kable(head(Corpus), caption = "Corpus data")

Corpus data
text	type
Now, I’m off to work on an interpretation of animus.	blog
u should have shared !!	tweet
Here are a few smoothie recipes with runners in mind:	blog
The deal also would resolve controversial GOP demands to restrict abortion and environmental regulation, but officials did not detail how those issues would be solved.	news
Please Skip. What else do you expect these guys to say? That Tebow isn’t good? It’s spin Skip. SPIN	tweet
how long are you there? I arrive Friday.	tweet

The resulting Corpus object is 166.8 Mb in size and contains 853,936 lines. As a next step, we form all contiguous sequences of one, two and three items from the given lines. These are called n-grams. At this stage we remove any sequences that obviously do not contribute to our text prediction purposes, such as punctuation, alphanumeric words, non-English words and so on.

# Tokenize the corpus sample into uni-, bi- and tri-grams. Each call filters
# its tokens and saves the result as an .rds file used by the later steps.
setUnigrams()
setBigrams()
setTrigrams()

The number of lines and the respective distinct n-gram counts for each source type are shown in the following table:

# Per-source line counts and distinct n-gram counts, printed with "," as thousands separator.
kable(getCorpusStats(), format.args = list(big.mark = ","), caption = "Corpus stats")

Corpus stats
type	nbr_of_lines	nbr_of_distinct_unigrams	nbr_of_distinct_bigrams	nbr_of_distinct_trigrams
blog	179,914	137,840	2,089,300	5,138,383
news	202,643	135,369	2,162,073	5,025,137
tweet	471,379	143,028	1,846,649	4,249,542

N-gram frequencies

Uni-grams

Top 20 most frequent uni-grams are shown below:

# Horizontal bar chart of the most frequent uni-grams in the sample.
plotUnigram()

Bi-grams

A network of top 60 most frequent bi-grams:

# Network graph of the most frequent bi-grams; each arrow links the first word to the second.
chainBigram()

The transparency of each link represents how frequent the bi-gram is: the rarer the bi-gram, the more transparent its link.

Tri-grams

Top 20 most frequent tri-grams are shown below:

# Horizontal bar chart of the most frequent tri-grams in the sample.
plotTrigram()

Model thoughts

In our text prediction algorithm we will propose the next word by checking tri-grams, bi-grams and uni-grams in sequence, depending on how many words the user has entered. If no match is found, the most frequent uni-gram will be proposed.

Appendix - list of code used

Libraries

library(tidyverse)
library(stringr)
library(tidytext)
library(knitr)
library(scales)
library(igraph)
library(ggraph)

Function bigmarkformat - used for display of x-axis values

#' Format numbers as axis labels with a thousands separator
#'
#' @param x A numeric vector (for example the axis breaks passed in by ggplot2).
#' @return A character vector the same length as `x`, e.g. "1,234,567".
bigmarkformat <- function(x){
        # scientific = FALSE: without it, round breaks such as 200000 are
        # rendered as "2e+05" and never get a thousands separator.
        # trim = TRUE: labels are not padded with blanks to a common width.
        format(x, big.mark = ",", scientific = FALSE, trim = TRUE)
}

Function setCorpus() - sets the Corpus object

#' Build the full corpus and cache it on disk
#'
#' Reads the twitter, news and blogs text files line by line, tags every line
#' with its source ("tweet", "news" or "blog"), stacks the three tables in that
#' order and writes the result to the file tb_allin.rds in the data_obj folder.
#'
#' @return The value returned by `write_rds()`.
setCorpus <- function(){

        # file.path() instead of hard-coded "\\" separators, so the paths
        # resolve on every platform and not only on Windows.
        dir_in <- file.path("final", "en_US")

        # Source tag -> file name. The order (tweets, news, blogs) fixes the row
        # order of the corpus, which the seeded sampling downstream depends on.
        sources <- c(tweet = "en_US.twitter.txt",
                     news  = "en_US.news.txt",
                     blog  = "en_US.blogs.txt")

        # Read one source file into a two-column tibble: text, type.
        read_source <- function(file_name, source_type){
                tibble(text = read_lines(file.path(dir_in, file_name),
                                         skip = 0,
                                         n_max = -1,
                                         locale = default_locale(),
                                         na = character(),
                                         progress = interactive()),
                       type = source_type)
        }

        tb_allin <- bind_rows(Map(read_source, unname(sources), names(sources)))

        # Make sure the cache folder exists; write_rds() does not create it.
        dir.create("data_obj", showWarnings = FALSE)
        write_rds(tb_allin, file.path("data_obj", "tb_allin.rds"))
}

Function getCorpus() - returns the Corpus object

#' Load the cached full corpus
#'
#' @return The object stored in tb_allin.rds inside the data_obj folder
#'   (the file written by `setCorpus()`).
getCorpus <- function(){
        # file.path() keeps the path portable instead of a hard-coded "\\".
        read_rds(file.path("data_obj", "tb_allin.rds"))
}

Function getCorpusSample() - returns a sample of the Corpus object

#' Draw the 20% working sample of the corpus
#'
#' Note: calls `set.seed()`, i.e. it resets the global random number generator.
#'
#' @return A random 20% of the rows of the cached corpus; the fixed seed makes
#'   every call return the same rows.
getCorpusSample <- function(){
        set.seed(6320)
        # Reuse getCorpus() so the cache location is defined in one place only.
        getCorpus() %>%
                sample_frac(0.2)
}

Function setUnigrams() - sets the unigrams

#' Tokenize the corpus sample into single words and cache the result
#'
#' Keeps only the tokens that match "[a-z]+|[0-9]+", i.e. that contain at least
#' one letter a-z or one digit, and writes them to tb_unigrams_clean.rds in the
#' data_obj folder.
#'
#' @return The value returned by `write_rds()`.
setUnigrams <- function(){
        tb_unigrams <- getCorpusSample() %>%
                unnest_tokens(word, text) %>%
                filter(str_detect(word, "[a-z]+|[0-9]+"))
        # file.path() keeps the path portable instead of a hard-coded "\\".
        write_rds(tb_unigrams, file.path("data_obj", "tb_unigrams_clean.rds"))
}

Function getUnigrams() - gets the unigrams

#' Load the cached unigrams
#'
#' @return The object stored in tb_unigrams_clean.rds inside the data_obj
#'   folder (the file written by `setUnigrams()`).
getUnigrams <- function(){
        # file.path() keeps the path portable instead of a hard-coded "\\".
        read_rds(file.path("data_obj", "tb_unigrams_clean.rds"))
}

Function setBigrams() - sets the bigrams

#' Tokenize the corpus sample into bigrams and cache the result
#'
#' Each bigram is split into its two words so that both can be checked against
#' "[a-z]+|[0-9]+" (at least one letter a-z or one digit); bigrams that pass are
#' re-joined with a single space and written to tb_bigrams_clean.rds in the
#' data_obj folder.
#'
#' @return The value returned by `write_rds()`.
setBigrams <- function(){
        tb_bigrams <- getCorpusSample() %>%
                unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
                separate(bigram, c("word1", "word2"), sep = " ") %>%
                filter(
                        str_detect(word1, "[a-z]+|[0-9]+"),
                        str_detect(word2, "[a-z]+|[0-9]+")
                ) %>%
                unite(bigram, word1, word2, sep = " ")
        # file.path() keeps the path portable instead of a hard-coded "\\".
        write_rds(tb_bigrams, file.path("data_obj", "tb_bigrams_clean.rds"))
}

Function getBigrams() - gets the bigrams

#' Load the cached bigrams
#'
#' @return The object stored in tb_bigrams_clean.rds inside the data_obj
#'   folder (the file written by `setBigrams()`).
getBigrams <- function(){
        # file.path() keeps the path portable instead of a hard-coded "\\".
        read_rds(file.path("data_obj", "tb_bigrams_clean.rds"))
}

Function setTrigrams() - sets the trigrams

#' Tokenize the corpus sample into trigrams and cache the result
#'
#' Each trigram is split into its three words so that all of them can be checked
#' against "[a-z]+|[0-9]+" (at least one letter a-z or one digit); trigrams that
#' pass are re-joined with single spaces and written to tb_trigrams_clean.rds in
#' the data_obj folder.
#'
#' @return The value returned by `write_rds()`.
setTrigrams <- function(){
        tb_trigrams <- getCorpusSample() %>%
                unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
                separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
                filter(
                        str_detect(word1, "[a-z]+|[0-9]+"),
                        str_detect(word2, "[a-z]+|[0-9]+"),
                        str_detect(word3, "[a-z]+|[0-9]+")
                ) %>%
                unite(trigram, word1, word2, word3, sep = " ")
        # Separate statement (not part of the pipe above).
        # file.path() keeps the path portable instead of a hard-coded "\\".
        write_rds(tb_trigrams, file.path("data_obj", "tb_trigrams_clean.rds"))
}

Function getTrigrams() - gets the trigrams

#' Load the cached trigrams
#'
#' @return The object stored in tb_trigrams_clean.rds inside the data_obj
#'   folder (the file written by `setTrigrams()`).
getTrigrams <- function(){
        # file.path() keeps the path portable instead of a hard-coded "\\".
        read_rds(file.path("data_obj", "tb_trigrams_clean.rds"))
}

Function getCorpusStats() - gets the stats of the sample Corpus object

#' Summarize the corpus sample per source type
#'
#' @return A tibble with one row per `type` and the columns `type`,
#'   `nbr_of_lines`, `nbr_of_distinct_unigrams`, `nbr_of_distinct_bigrams` and
#'   `nbr_of_distinct_trigrams`. An n-gram count is NA when a type has no
#'   n-grams of that size in the cached tables.
getCorpusStats <- function(){

        lines_by_type <- getCorpusSample() %>%
                group_by(type) %>%
                summarize(nbr_of_lines = n())

        unigrams_by_type <- getUnigrams() %>%
                group_by(type) %>%
                summarize(nbr_of_distinct_unigrams = n_distinct(word))

        bigrams_by_type <- getBigrams() %>%
                group_by(type) %>%
                summarize(nbr_of_distinct_bigrams = n_distinct(bigram))

        trigrams_by_type <- getTrigrams() %>%
                group_by(type) %>%
                summarize(nbr_of_distinct_trigrams = n_distinct(trigram))

        # Join on `type` rather than gluing columns together by row position:
        # positional binding pairs counts with the wrong type (or recycles /
        # fails) as soon as one summary lacks a type the others have.
        lines_by_type %>%
                left_join(unigrams_by_type, by = "type") %>%
                left_join(bigrams_by_type, by = "type") %>%
                left_join(trigrams_by_type, by = "type")
}

Function plotUnigram() - plots the most frequent unigrams

#' Bar chart of the most frequent unigrams
#'
#' @return A ggplot object: horizontal blue bars, one per unigram, for the 20
#'   highest counts (ties at the cut-off are all kept), ordered by count.
plotUnigram <-  function(){
        top_unigrams <- getUnigrams() %>%
                count(word, sort = TRUE) %>%
                # Explicit wt: do not rely on "last column" guessing, which also
                # prints a "Selecting by n" message into the report.
                top_n(20, wt = n) %>%
                # Order the factor by count so the bars are sorted.
                mutate(gram = reorder(word, n))

        ggplot(top_unigrams, aes(gram, n)) +
                geom_col(show.legend = FALSE, fill = "blue") +
                xlab(NULL) +
                coord_flip() +
                scale_y_continuous(labels = bigmarkformat) +
                theme(axis.text.x = element_text(angle = 45, vjust = 0.7, size = 10, hjust = 0.9))
}

Function chainBigram() - visualizing bigrams network

#' Network plot of the most frequent bigrams
#'
#' Every bigram becomes an arrow from its first word to its second word; the
#' edge transparency is mapped to the bigram count.
#' Note: calls `set.seed()`, i.e. it resets the global random number generator.
#'
#' @return A ggraph/ggplot object showing the 60 highest bigram counts (ties at
#'   the cut-off are all kept).
chainBigram <- function(){
        bigram_graph <- getBigrams() %>%
                separate(bigram, c("word1", "word2"), sep = " ") %>%
                count(word1, word2, sort = TRUE) %>%
                # Explicit wt: do not rely on "last column" guessing, which also
                # prints a "Selecting by n" message into the report.
                top_n(60, wt = n) %>%
                graph_from_data_frame()

        # Fixed seed so the "fr" layout is the same on every run.
        set.seed(6320)

        edge_arrow <- grid::arrow(type = "closed", length = grid::unit(.15, "inches"))

        ggraph(bigram_graph, layout = "fr") +
                geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                               arrow = edge_arrow, end_cap = circle(.07, 'inches')) +
                geom_node_point(color = "lightblue", size = 5) +
                geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
                theme_void()
}

Function plotTrigram() - plots the most frequent trigrams

#' Bar chart of the most frequent trigrams
#'
#' @return A ggplot object: horizontal green bars, one per trigram, for the 20
#'   highest counts (ties at the cut-off are all kept), ordered by count.
plotTrigram <-  function(){
        top_trigrams <- getTrigrams() %>%
                count(trigram, sort = TRUE) %>%
                # Explicit wt: do not rely on "last column" guessing, which also
                # prints a "Selecting by n" message into the report.
                top_n(20, wt = n) %>%
                # Order the factor by count so the bars are sorted.
                mutate(gram = reorder(trigram, n))

        ggplot(top_trigrams, aes(gram, n)) +
                geom_col(show.legend = FALSE, fill = "green") +
                xlab(NULL) +
                coord_flip() +
                scale_y_continuous(labels = bigmarkformat) +
                theme(axis.text.x = element_text(angle = 45, vjust = 0.7, size = 10, hjust = 0.9))
}