This document analyses a corpus built from samples of the Blogs, News and Tweets datasets. The analysis focuses on the construction of n-grams and their frequencies.

library(ggplot2)
library(dplyr)
library(tidytext)
twitter_lines <- readLines("./final/en_US/en_US.twitter.txt", encoding = "UTF-8")
blogs_lines <- readLines("./final/en_US/en_US.blogs.txt", encoding = "UTF-8")
news_lines <- readLines("./final/en_US/en_US.news.txt", encoding = "UTF-8")
cat(length(twitter_lines), "lines in the twitter dataset\n")
cat(length(blogs_lines), "lines in the blogs dataset\n")
cat(length(news_lines), "lines in the news dataset\n")
## 2360148 lines in the twitter dataset
## 899287 lines in the blogs dataset
## 1010242 lines in the news dataset

We can see that the lengths of the texts in the twitter dataset are between 1 and 140 characters, given the character limit on tweets set by the website. The other two datasets contain texts of up to about 40,000 characters, but most of the texts are not nearly that long.

hist(nchar(twitter_lines), main="Length frequency Twitter Dataset", xlab="Length")

hist(log10(nchar(blogs_lines)), main="Log Length frequency Blogs Dataset", xlab="Log10 Length")

hist(log10(nchar(news_lines)), main="Log Length frequency News Dataset", xlab="Log10 Length")
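The ranges quoted above can be verified directly on the character counts (a quick sketch; the exact minimum and maximum values are not reproduced here):

range(nchar(twitter_lines))  # expected to span roughly 1 to 140 characters
range(nchar(blogs_lines))    # maximum on the order of 40,000 characters
range(nchar(news_lines))     # most texts are far shorter than the maximum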

To keep the data analysis manageable, we take a random sample of 50,000 texts, drawn evenly from the three datasets.

length.data <- 50000
# each dataset contributes length.data/3 texts; sample() truncates the
# non-integer size to 16666, so the combined sample has 49998 texts
original_lines <- c(
  sample(twitter_lines, length.data/3),
  sample(blogs_lines, length.data/3),
  sample(news_lines, length.data/3)
)
length(original_lines)
## [1] 49998
original_lines <- data.frame(sentence=original_lines)
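Since sample() draws a different subset on every run, the exact counts reported below will vary slightly between runs. For reproducibility, the random seed could be fixed before sampling (a sketch; this was not done in the original run):

set.seed(1234)  # arbitrary fixed seed so the same 49998 texts are drawn each time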

Single Words

words <- original_lines %>%
  unnest_tokens(word, sentence) %>%
  count(word, sort = T) %>% 
  # count is the frequency rank of each word; freq is its share of all word occurrences
  mutate(count=1:n(), freq=n/sum(n)) %>%
  mutate(accum_freq=cumsum(freq))  # cumulative share covered by the top-ranked words
words %>% slice_head(n=10)
##    word     n count       freq accum_freq
## 1   the 72616     1 0.04931926 0.04931926
## 2    to 40327     2 0.02738925 0.07670851
## 3   and 38014     3 0.02581831 0.10252682
## 4     a 35713     4 0.02425552 0.12678234
## 5    of 31370     5 0.02130584 0.14808818
## 6    in 24468     6 0.01661815 0.16470633
## 7     i 22267     7 0.01512328 0.17982961
## 8  that 15644     8 0.01062508 0.19045468
## 9   for 15195     9 0.01032012 0.20077481
## 10   is 15066    10 0.01023251 0.21100732
cat("There are", dim(words)[1], "different words in the texts")
## There are 66701 different words in the texts
words %>% ggplot(aes(x=count, y=freq)) + geom_line()

words.90percentile <- which.min(abs(words$accum_freq-0.9))
words %>% 
  ggplot(aes(x=count, y=accum_freq)) + 
  geom_line() +
  geom_hline(yintercept=0.9, color="red") +
  geom_vline(xintercept=words.90percentile, color="red")

cat(words.90percentile, "words account for 90% of all word occurrences")
## 7833 words account for 90% of all word occurrences
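If a reduced vocabulary is needed, the actual set of words covering 90% of the occurrences can be extracted from the ranked table (a sketch, not part of the original analysis):

coverage_words <- words %>%
  slice_head(n = words.90percentile) %>%  # the top-ranked words up to 90% cumulative frequency
  pull(word)
length(coverage_words)  # equals words.90percentile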

2-grams

two.grams <- original_lines %>%
  unnest_ngrams(ngram, sentence, n=2) %>%
  count(ngram, sort = T) %>% 
  mutate(count=1:n(), freq=n/sum(n)) %>%
  mutate(accum_freq=cumsum(freq)) 
two.grams %>% slice_head(n=10)
##       ngram    n count        freq  accum_freq
## 1    of the 7017     1 0.004709810 0.004709810
## 2    in the 6311     2 0.004235943 0.008945753
## 3    to the 3197     3 0.002145826 0.011091579
## 4    on the 3021     4 0.002027695 0.013119274
## 5   for the 2859     5 0.001918961 0.015038235
## 6     to be 2429     6 0.001630345 0.016668580
## 7   and the 2119     7 0.001422273 0.018090852
## 8    at the 2085     8 0.001399452 0.019490304
## 9      in a 1862     9 0.001249774 0.020740078
## 10 with the 1698    10 0.001139698 0.021879776
two.grams.90percentile <- which.min(abs(two.grams$accum_freq-0.9))
two.grams %>% 
  ggplot(aes(x=count, y=accum_freq)) + 
  geom_line() +
  geom_hline(yintercept=0.9, color="red") +
  geom_vline(xintercept=two.grams.90percentile, color="red")

cat(two.grams.90percentile, "2-grams account for 90% of all 2-gram occurrences")
## 507993 2-grams account for 90% of all 2-gram occurrences

3-grams

three.grams <- original_lines %>%
  unnest_ngrams(ngram, sentence, n=3) %>%
  count(ngram, sort = T) %>% 
  mutate(count=1:n(), freq=n/sum(n)) %>%
  mutate(accum_freq=cumsum(freq)) 
three.grams %>% slice_head(n=10)
##          ngram   n count         freq   accum_freq
## 1   one of the 490     1 0.0003288882 0.0003288882
## 2     a lot of 480     2 0.0003221762 0.0006510644
## 3   as well as 258     3 0.0001731697 0.0008242341
## 4      to be a 254     4 0.0001704849 0.0009947190
## 5     it was a 251     5 0.0001684713 0.0011631903
## 6   the end of 249     6 0.0001671289 0.0013303192
## 7   out of the 247     7 0.0001657865 0.0014961057
## 8  going to be 236     8 0.0001584033 0.0016545090
## 9  some of the 233     9 0.0001563897 0.0018108987
## 10 part of the 211    10 0.0001416233 0.0019525220

4-grams

four.grams <- original_lines %>%
  unnest_ngrams(ngram, sentence, n=4) %>%
  count(ngram, sort = T) %>% 
  mutate(count=1:n(), freq=n/sum(n)) %>%
  mutate(accum_freq=cumsum(freq)) 
four.grams %>% slice_head(n=10)
##                 ngram   n count         freq   accum_freq
## 1      the end of the 136     1 9.128332e-05 9.128332e-05
## 2     the rest of the 109     2 7.316089e-05 1.644442e-04
## 3       at the end of 104     3 6.980489e-05 2.342491e-04
## 4  for the first time  86     4 5.772327e-05 2.919724e-04
## 5    at the same time  76     5 5.101126e-05 3.429836e-04
## 6    when it comes to  65     6 4.362806e-05 3.866117e-04
## 7    in the middle of  63     7 4.228565e-05 4.288973e-04
## 8       is one of the  63     8 4.228565e-05 4.711830e-04
## 9       to be able to  63     9 4.228565e-05 5.134687e-04
## 10    one of the most  58    10 3.892965e-05 5.523983e-04
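
For completeness, the same 90% coverage calculation used for the single words and the 2-grams could be applied to the 3-gram and 4-gram tables (a sketch mirroring the earlier code; the resulting counts are not reported here):

three.grams.90percentile <- which.min(abs(three.grams$accum_freq-0.9))
four.grams.90percentile <- which.min(abs(four.grams$accum_freq-0.9))
cat(three.grams.90percentile, "3-grams account for 90% of all 3-gram occurrences\n")
cat(four.grams.90percentile, "4-grams account for 90% of all 4-gram occurrences\n")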