This document analyses a dataset built from samples of the Blogs, News and Twitter datasets. The analysis focuses on the construction of n-grams and their frequencies.
library(ggplot2)
library(dplyr)
library(tidytext)
twitter_lines <- readLines("./final/en_US/en_US.twitter.txt", encoding = "UTF-8")
blogs_lines <- readLines("./final/en_US/en_US.blogs.txt", encoding = "UTF-8")
news_lines <- readLines("./final/en_US/en_US.news.txt", encoding = "UTF-8")
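The line counts reported below can be obtained with length(); a minimal sketch of the chunk that presumably produced this output (the original chunk is not echoed):
cat(length(twitter_lines), "lines in the twitter dataset\n")
cat(length(blogs_lines), "lines in the blogs dataset\n")
cat(length(news_lines), "lines in the news dataset\n")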
## 2360148 lines in the twitter dataset
## 899287 lines in the blogs dataset
## 1010242 lines in the news dataset
We can see that the length of the texts in the twitter dataset is between 1 and 140 characters, given the length limit on tweets set by the website at the time. The other two datasets contain texts of up to roughly 40,000 characters, but most are far shorter, which is why their lengths are plotted on a log10 scale.
hist(nchar(twitter_lines), main="Length frequency Twitter Dataset", xlab="Length")
hist(log10(nchar(blogs_lines)), main="Log Length frequency Blogs Dataset", xlab="Log10 Length")
hist(log10(nchar(news_lines)), main="Log Length frequency News Dataset", xlab="Log10 Length")
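The long right tails of the blogs and news texts can also be checked numerically with base R summaries; a quick sketch (these summaries were not part of the original output):
summary(nchar(twitter_lines))
summary(nchar(blogs_lines))
summary(nchar(news_lines))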
To keep the analysis manageable we take a random sample of roughly 50,000 texts, drawn evenly from the three datasets. Because 50,000 is not divisible by three, each call to sample() draws 16,666 lines (50,000/3 rounded down), so the combined sample contains 49,998 texts rather than exactly 50,000.
length.data <- 50000
original_lines <- c(
sample(twitter_lines, length.data/3),
sample(blogs_lines, length.data/3),
sample(news_lines, length.data/3)
)
length(original_lines)
## [1] 49998
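Since the sample is drawn at random, the exact counts in the rest of this report will vary between runs; a seed could be set before the calls to sample() to make the analysis reproducible (a sketch, not part of the original run):
set.seed(1234)  # hypothetical seed; would need to be placed before the sample() calls above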
original_lines <- data.frame(sentence=original_lines)
words <- original_lines %>%
  unnest_tokens(word, sentence) %>%
  count(word, sort = TRUE) %>%
  mutate(count = 1:n(), freq = n / sum(n)) %>%  # 'count' is the word's frequency rank, 'n' its number of occurrences
  mutate(accum_freq = cumsum(freq))
words %>% slice_head(n=10)
## word n count freq accum_freq
## 1 the 72616 1 0.04931926 0.04931926
## 2 to 40327 2 0.02738925 0.07670851
## 3 and 38014 3 0.02581831 0.10252682
## 4 a 35713 4 0.02425552 0.12678234
## 5 of 31370 5 0.02130584 0.14808818
## 6 in 24468 6 0.01661815 0.16470633
## 7 i 22267 7 0.01512328 0.17982961
## 8 that 15644 8 0.01062508 0.19045468
## 9 for 15195 9 0.01032012 0.20077481
## 10 is 15066 10 0.01023251 0.21100732
cat("There are", dim(words)[1], "different words in the texts")
## There are 66701 different words in the texts
words %>% ggplot(aes(x=count, y=freq)) + geom_line()
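On a linear scale this plot is dominated by the handful of most frequent words; the heavy tail is easier to see with both axes on a log scale (an additional view, not part of the original report):
words %>%
  ggplot(aes(x = count, y = freq)) +
  geom_line() +
  scale_x_log10() +
  scale_y_log10()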
words.90percentile <- which.min(abs(words$accum_freq - 0.9))  # rank whose cumulative frequency is closest to 90%
words %>%
ggplot(aes(x=count, y=accum_freq)) +
geom_line() +
geom_hline(yintercept=0.9, color="red") +
geom_vline(xintercept=words.90percentile, color="red")
cat(words.90percentile, "words account for 90% of all word occurrences")
## 7833 words account for 90% of all word occurrences
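The same calculation can be wrapped in a small helper so the coverage threshold becomes a parameter; a sketch assuming the words data frame built above (coverage_rank is a name we introduce, not part of the original code):
# First rank whose cumulative frequency reaches the requested coverage
# (slightly stricter than the which.min(abs(...)) approach used above)
coverage_rank <- function(freq_table, coverage = 0.9) {
  which(freq_table$accum_freq >= coverage)[1]
}
coverage_rank(words, 0.5)  # words needed to cover 50% of all occurrences
coverage_rank(words, 0.9)  # close to words.90percentile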
two.grams <- original_lines %>%
  unnest_ngrams(ngram, sentence, n = 2) %>%
  count(ngram, sort = TRUE) %>%
  mutate(count = 1:n(), freq = n / sum(n)) %>%
  mutate(accum_freq = cumsum(freq))
two.grams %>% slice_head(n=10)
## ngram n count freq accum_freq
## 1 of the 7017 1 0.004709810 0.004709810
## 2 in the 6311 2 0.004235943 0.008945753
## 3 to the 3197 3 0.002145826 0.011091579
## 4 on the 3021 4 0.002027695 0.013119274
## 5 for the 2859 5 0.001918961 0.015038235
## 6 to be 2429 6 0.001630345 0.016668580
## 7 and the 2119 7 0.001422273 0.018090852
## 8 at the 2085 8 0.001399452 0.019490304
## 9 in a 1862 9 0.001249774 0.020740078
## 10 with the 1698 10 0.001139698 0.021879776
two.grams.90percentile <- which.min(abs(two.grams$accum_freq-0.9))
two.grams %>%
ggplot(aes(x=count, y=accum_freq)) +
geom_line() +
geom_hline(yintercept=0.9, color="red") +
geom_vline(xintercept=two.grams.90percentile, color="red")
cat(two.grams.90percentile, "2-grams account for 90% of all 2-gram occurrences")
## 507993 2-grams account for 90% of all 2-gram occurrences
three.grams <- original_lines %>%
  unnest_ngrams(ngram, sentence, n = 3) %>%
  count(ngram, sort = TRUE) %>%
  mutate(count = 1:n(), freq = n / sum(n)) %>%
  mutate(accum_freq = cumsum(freq))
three.grams %>% slice_head(n=10)
## ngram n count freq accum_freq
## 1 one of the 490 1 0.0003288882 0.0003288882
## 2 a lot of 480 2 0.0003221762 0.0006510644
## 3 as well as 258 3 0.0001731697 0.0008242341
## 4 to be a 254 4 0.0001704849 0.0009947190
## 5 it was a 251 5 0.0001684713 0.0011631903
## 6 the end of 249 6 0.0001671289 0.0013303192
## 7 out of the 247 7 0.0001657865 0.0014961057
## 8 going to be 236 8 0.0001584033 0.0016545090
## 9 some of the 233 9 0.0001563897 0.0018108987
## 10 part of the 211 10 0.0001416233 0.0019525220
four.grams <- original_lines %>%
  unnest_ngrams(ngram, sentence, n = 4) %>%
  count(ngram, sort = TRUE) %>%
  mutate(count = 1:n(), freq = n / sum(n)) %>%
  mutate(accum_freq = cumsum(freq))
four.grams %>% slice_head(n=10)
## ngram n count freq accum_freq
## 1 the end of the 136 1 9.128332e-05 9.128332e-05
## 2 the rest of the 109 2 7.316089e-05 1.644442e-04
## 3 at the end of 104 3 6.980489e-05 2.342491e-04
## 4 for the first time 86 4 5.772327e-05 2.919724e-04
## 5 at the same time 76 5 5.101126e-05 3.429836e-04
## 6 when it comes to 65 6 4.362806e-05 3.866117e-04
## 7 in the middle of 63 7 4.228565e-05 4.288973e-04
## 8 is one of the 63 8 4.228565e-05 4.711830e-04
## 9 to be able to 63 9 4.228565e-05 5.134687e-04
## 10 one of the most 58 10 3.892965e-05 5.523983e-04
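The 2-, 3- and 4-gram pipelines above differ only in n, so they could be collected into a single helper; a sketch assuming the original_lines data frame from above (count_ngrams is a name we introduce, not part of the original code):
# Frequency table of n-grams with rank ('count'), relative and cumulative frequency
count_ngrams <- function(texts, ngram_size) {
  texts %>%
    unnest_ngrams(ngram, sentence, n = ngram_size) %>%
    count(ngram, sort = TRUE) %>%
    mutate(count = 1:n(), freq = n / sum(n)) %>%
    mutate(accum_freq = cumsum(freq))
}
two.grams   <- count_ngrams(original_lines, 2)
three.grams <- count_ngrams(original_lines, 3)
four.grams  <- count_ngrams(original_lines, 4)
coverage_rank(three.grams, 0.9)  # the coverage helper defined earlier works on any of these tables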