The goal of this report is to investigate the distribution of, and relationships between, the words, tokens, and phrases in a set of files containing text from news, tweets and blogs. Basic relationships observed in the data are presented. These are the first steps to take in preparing to build a linguistic model.
As a first step, we load the data from the files en_US.twitter.txt, en_US.blogs.txt and en_US.news.txt and draw a 20% sample.
setCorpus()
Corpus <- getCorpusSample()
kable(head(Corpus), caption = "Corpus data")
| text | type |
|---|---|
| Now, I’m off to work on an interpretation of animus. | blog |
| u should have shared !! | tweet |
| Here are a few smoothie recipes with runners in mind: | blog |
| The deal also would resolve controversial GOP demands to restrict abortion and environmental regulation, but officials did not detail how those issues would be solved. | news |
| Please Skip. What else do you expect these guys to say? That Tebow isn’t good? It’s spin Skip. SPIN | tweet |
| how long are you there? I arrive Friday. | tweet |
What we get is a Corpus of size 166.8 Mb, containing 853,936 lines. As a next step, we form all possible contiguous sequences of one, two and three items from the given lines. These are called n-grams. At this stage we remove any sequences that obviously do not contribute to our text prediction purposes, such as punctuation, alphanumeric words, non-English words and so on.
setUnigrams()
setBigrams()
setTrigrams()
The number of lines and the respective distinct n-gram counts for each source type are shown in the following table:
kable(getCorpusStats(), format.args = list(big.mark = ","), caption = "Corpus stats")
| type | nbr_of_lines | nbr_of_distinct_unigrams | nbr_of_distinct_bigrams | nbr_of_distinct_trigrams |
|---|---|---|---|---|
| blog | 179,914 | 137,840 | 2,089,300 | 5,138,383 |
| news | 202,643 | 135,369 | 2,162,073 | 5,025,137 |
| tweet | 471,379 | 143,028 | 1,846,649 | 4,249,542 |
Top 20 most frequent uni-grams are shown below:
plotUnigram()
A network of top 60 most frequent bi-grams:
chainBigram()
The transparency of a link represents how frequent or rare the bi-gram is.
Top 20 most frequent tri-grams are shown below:
plotTrigram()
In our text prediction algorithm we will propose the next word by checking tri-grams, bi-grams and uni-grams in sequence, depending on how many words have been entered by the user. If no match is found, the most frequent unigram will be proposed.
Libraries
library(tidyverse)
library(stringr)
library(tidytext)
library(knitr)
library(scales)
library(igraph)
library(ggraph)
Function bigmarkformat - used for display of x-axis values
# Format numeric axis breaks with a thousands separator,
# e.g. 100000 -> "100,000".
#
# x: numeric vector of break values (used as the `labels` function of a
#    ggplot2 continuous scale).
# Returns a character vector with one label per element of `x`.
bigmarkformat <- function(x) {
  # scientific = FALSE: by default format() picks the "1e+05" form whenever
  #   it is narrower than the fixed form, which is exactly what round axis
  #   breaks trigger.
  # trim = TRUE: do not left-pad the labels to a common width.
  format(x, big.mark = ",", scientific = FALSE, trim = TRUE)
}
Function setCorpus() - sets the Corpus object
# Read the three raw en_US text files, tag every line with its source type
# and cache the combined table as an RDS file.
#
# Takes no arguments. Reads en_US.twitter.txt, en_US.news.txt and
# en_US.blogs.txt from final/en_US (relative to the working directory) and
# writes data_obj/tb_allin.rds.
# Returns whatever write_rds() returns for the combined tibble
# (columns `text` and `type`).
setCorpus <- function() {
  # file.path() instead of hard-coded "\\" so the paths also resolve on
  # non-Windows systems.
  src_dir <- file.path("final", "en_US")

  # Order matters: rows are stacked tweet -> news -> blog, and
  # getCorpusSample() draws a seeded sample from that row order.
  sources <- c(
    tweet = "en_US.twitter.txt",
    news = "en_US.news.txt",
    blog = "en_US.blogs.txt"
  )

  # Read one source file into a two-column tibble (text, type).
  # The other read_lines() arguments the original spelled out (skip, n_max,
  # locale, na) were the function defaults and are omitted.
  read_source <- function(type) {
    tibble(
      text = read_lines(
        file.path(src_dir, sources[[type]]),
        progress = interactive()
      ),
      type = type
    )
  }

  tb_allin <- bind_rows(lapply(names(sources), read_source))

  # Make sure the cache directory exists before writing into it.
  out_dir <- "data_obj"
  dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
  write_rds(tb_allin, file.path(out_dir, "tb_allin.rds"))
}
Function getCorpus() - returns the Corpus object
# Load the full cached corpus written by setCorpus().
#
# Returns the object stored in data_obj/tb_allin.rds.
getCorpus <- function() {
  # file.path() keeps the location portable across operating systems.
  read_rds(file.path("data_obj", "tb_allin.rds"))
}
Function getCorpusSample() - returns a sample of the Corpus object
# Draw a reproducible random sample of rows from the cached corpus.
#
# frac: fraction of rows to keep (default 0.2, the value used in the report).
# seed: RNG seed set before sampling (default 6320).
# Returns the sampled rows of data_obj/tb_allin.rds.
#
# Note: this calls set.seed(), so it resets the session-wide RNG state on
# every call. Repeated calls with the same arguments return the same rows.
getCorpusSample <- function(frac = 0.2, seed = 6320) {
  set.seed(seed)
  # sample_frac() is kept (rather than a newer sampler) so the rows drawn
  # for a given seed stay the same as before.
  read_rds(file.path("data_obj", "tb_allin.rds")) %>%
    sample_frac(frac)
}
Function setUnigrams() - sets the unigrams
# Tokenise the corpus sample into single words and cache the result.
#
# Takes no arguments. Writes data_obj/tb_unigrams_clean.rds and returns
# whatever write_rds() returns for the table (one row per kept token, in
# column `word`, alongside the remaining corpus columns).
setUnigrams <- function() {
  tb_unigrams <- getCorpusSample() %>%
    unnest_tokens(word, text) %>%
    # Keeps every token that contains at least one ASCII lower-case letter
    # or one digit; tokens with neither are dropped.
    filter(str_detect(word, "[a-z]+|[0-9]+"))

  # file.path() keeps the cache location portable across operating systems.
  write_rds(tb_unigrams, file.path("data_obj", "tb_unigrams_clean.rds"))
}
Function getUnigrams() - gets the unigrams
# Load the cached unigram table written by setUnigrams().
#
# Returns the object stored in data_obj/tb_unigrams_clean.rds.
getUnigrams <- function() {
  # file.path() keeps the location portable across operating systems.
  read_rds(file.path("data_obj", "tb_unigrams_clean.rds"))
}
Function setBigrams() - sets the bigrams
# Tokenise the corpus sample into two-word sequences and cache the result.
#
# Takes no arguments. Writes data_obj/tb_bigrams_clean.rds and returns
# whatever write_rds() returns for the table (one row per kept bigram, in
# column `bigram`, alongside the remaining corpus columns).
setBigrams <- function() {
  tb_bigrams <- getCorpusSample() %>%
    unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
    # Split so each word can be tested on its own, then glue back together.
    separate(bigram, c("word1", "word2"), sep = " ") %>%
    # A bigram is kept only if both words contain at least one ASCII
    # lower-case letter or digit.
    filter(
      str_detect(word1, "[a-z]+|[0-9]+"),
      str_detect(word2, "[a-z]+|[0-9]+")
    ) %>%
    unite(bigram, word1, word2, sep = " ")

  # file.path() keeps the cache location portable across operating systems.
  write_rds(tb_bigrams, file.path("data_obj", "tb_bigrams_clean.rds"))
}
Function getBigrams() - gets the bigrams
# Load the cached bigram table written by setBigrams().
#
# Returns the object stored in data_obj/tb_bigrams_clean.rds.
getBigrams <- function() {
  # file.path() keeps the location portable across operating systems.
  read_rds(file.path("data_obj", "tb_bigrams_clean.rds"))
}
Function setTrigrams() - sets the trigrams
# Tokenise the corpus sample into three-word sequences and cache the result.
#
# Takes no arguments. Writes data_obj/tb_trigrams_clean.rds and returns
# whatever write_rds() returns for the table (one row per kept trigram, in
# column `trigram`, alongside the remaining corpus columns).
setTrigrams <- function() {
  tb_trigrams <- getCorpusSample() %>%
    unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
    # Split so each word can be tested on its own, then glue back together.
    separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
    # A trigram is kept only if all three words contain at least one ASCII
    # lower-case letter or digit.
    filter(
      str_detect(word1, "[a-z]+|[0-9]+"),
      str_detect(word2, "[a-z]+|[0-9]+"),
      str_detect(word3, "[a-z]+|[0-9]+")
    ) %>%
    unite(trigram, word1, word2, word3, sep = " ")

  # file.path() keeps the cache location portable across operating systems.
  write_rds(tb_trigrams, file.path("data_obj", "tb_trigrams_clean.rds"))
}
Function getTrigrams() - gets the trigrams
# Load the cached trigram table written by setTrigrams().
#
# Returns the object stored in data_obj/tb_trigrams_clean.rds.
getTrigrams <- function() {
  # file.path() keeps the location portable across operating systems.
  read_rds(file.path("data_obj", "tb_trigrams_clean.rds"))
}
Function getCorpusStats() - gets the stats of the sample Corpus object
# Summarise the corpus sample per source type.
#
# Takes no arguments. Returns a table with one row per `type` and the
# columns nbr_of_lines, nbr_of_distinct_unigrams, nbr_of_distinct_bigrams
# and nbr_of_distinct_trigrams.
getCorpusStats <- function() {
  line_counts <- getCorpusSample() %>%
    group_by(type) %>%
    summarize(nbr_of_lines = n())

  unigram_counts <- getUnigrams() %>%
    group_by(type) %>%
    summarize(nbr_of_distinct_unigrams = n_distinct(word))

  bigram_counts <- getBigrams() %>%
    group_by(type) %>%
    summarize(nbr_of_distinct_bigrams = n_distinct(bigram))

  trigram_counts <- getTrigrams() %>%
    group_by(type) %>%
    summarize(nbr_of_distinct_trigrams = n_distinct(trigram))

  # Join on `type` rather than binding columns by position: the four
  # summaries are computed independently, so matching them by row order
  # would silently pair the wrong rows (or fail) if a type were missing
  # from one of them. Types without n-grams get NA counts.
  line_counts %>%
    left_join(unigram_counts, by = "type") %>%
    left_join(bigram_counts, by = "type") %>%
    left_join(trigram_counts, by = "type")
}
Function plotUnigram() - plots the most frequent unigrams
# Horizontal bar chart of the 20 most frequent unigrams.
#
# Takes no arguments; reads the cached unigrams via getUnigrams().
# Returns a ggplot object.
plotUnigram <- function() {
  top_unigrams <- getUnigrams() %>%
    count(word, sort = TRUE) %>%
    # Name the weight column explicitly instead of relying on top_n()'s
    # "last column" default (which also prints "Selecting by n").
    top_n(20, wt = n) %>%
    # Order the factor by frequency so bars appear sorted.
    mutate(gram = reorder(word, n))

  ggplot(top_unigrams, aes(gram, n)) +
    geom_col(show.legend = FALSE, fill = "blue") +
    xlab(NULL) +
    coord_flip() +
    scale_y_continuous(labels = bigmarkformat) +
    theme(
      axis.text.x = element_text(
        angle = 45, vjust = 0.7, size = 10, hjust = 0.9
      )
    )
}
Function chainBigram() - visualizing bigrams network
# Directed network graph of the 60 most frequent bigrams
# (first word -> second word).
#
# Takes no arguments; reads the cached bigrams via getBigrams().
# Returns a ggraph/ggplot object. Edge transparency is mapped to the
# bigram count `n`.
#
# Note: this calls set.seed(6320), resetting the session-wide RNG state.
chainBigram <- function() {
  bigram_graph <- getBigrams() %>%
    separate(bigram, c("word1", "word2"), sep = " ") %>%
    count(word1, word2, sort = TRUE) %>%
    # Name the weight column explicitly instead of relying on top_n()'s
    # "last column" default (which also prints "Selecting by n").
    top_n(60, wt = n) %>%
    graph_from_data_frame()

  # Seed set before the layout is built - presumably so the "fr" layout
  # comes out the same on every run.
  set.seed(6320)
  # grid::unit is qualified to match grid::arrow, so neither depends on
  # which attached package happens to re-export it.
  edge_arrow <- grid::arrow(
    type = "closed",
    length = grid::unit(.15, "inches")
  )

  ggraph(bigram_graph, layout = "fr") +
    geom_edge_link(
      aes(edge_alpha = n),
      show.legend = FALSE,
      arrow = edge_arrow,
      end_cap = circle(.07, 'inches')
    ) +
    geom_node_point(color = "lightblue", size = 5) +
    geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
    theme_void()
}
Function plotTrigram() - plots the most frequent trigrams
# Horizontal bar chart of the 20 most frequent trigrams.
#
# Takes no arguments; reads the cached trigrams via getTrigrams().
# Returns a ggplot object.
plotTrigram <- function() {
  top_trigrams <- getTrigrams() %>%
    count(trigram, sort = TRUE) %>%
    # Name the weight column explicitly instead of relying on top_n()'s
    # "last column" default (which also prints "Selecting by n").
    top_n(20, wt = n) %>%
    # Order the factor by frequency so bars appear sorted.
    mutate(gram = reorder(trigram, n))

  ggplot(top_trigrams, aes(gram, n)) +
    geom_col(show.legend = FALSE, fill = "green") +
    xlab(NULL) +
    coord_flip() +
    scale_y_continuous(labels = bigmarkformat) +
    theme(
      axis.text.x = element_text(
        angle = 45, vjust = 0.7, size = 10, hjust = 0.9
      )
    )
}