library(tidyverse) #ggplot and general data wrangling and reading
library(ngram) #wordcount function
library(knitr) #table visualization
library(kableExtra) #table visualization
library(tidytext) #text handling
library(parallel) #parallel processing
library(cowplot) #ggplot2 subplots
First, I created two vectors, one with the language prefixes used in the file names and one with the file-type suffixes, and pasted them together to build the path to each file.
langs <- c('en_US/en_US', 'ru_RU/ru_RU', 'de_DE/de_DE', 'fi_FI/fi_FI')
texts <- c('.blogs.txt', '.news.txt', '.twitter.txt')
names <- NULL
for(lang in langs) {
  for(text in texts) {
    file <- paste0(lang, text)
    names <- c(names, file)
  }
}
Then I created a data frame whose first column is the result of the previous loop. Next I created a function that takes the path to a file and returns a vector containing its size in MB, its number of lines, its number of characters, and its number of words.
text_summary <- data.frame(fileNames = names)
summaryOfFiles <- function(file) {
  size <- round(file.size(file) / 10^6, 3)          # file size in MB
  read <- read_lines(file, skip_empty_rows = TRUE)
  lines <- length(read)                             # number of lines
  char <- sum(nchar(read))                          # total characters
  words <- wordcount(read, sep = ' ')               # total words (ngram::wordcount)
  row <- c(file, size, lines, char, words)
  return(row)
}
Next, I applied summaryOfFiles to each row of the text_summary data frame with apply(). Since apply() returns a matrix, I merge its transpose with the original data frame.
summaryPerFile <- apply(text_summary, 1, summaryOfFiles)
text_summary <- merge(text_summary, t(summaryPerFile), by = 'fileNames')
Then I format the column names, create new columns for the language and the type of text, and convert the count columns from character to numeric.
colnames(text_summary)[2:5] <- c('fileSizeInMb', 'numberOfLines', 'numberOfCharacters', 'numberOfWords')
text_summary$language <- rep(c('Deutsch', 'English','Finnish','Russian'), c(3, 3, 3, 3))
text_summary$fileType <- rep(c('Blogs', 'News', 'Twitter'), 4)
text_summary[, 2] <- as.numeric(text_summary[, 2])
text_summary[, 3] <- as.numeric(text_summary[, 3])
text_summary[, 4] <- as.numeric(text_summary[, 4])
text_summary[, 5] <- as.numeric(text_summary[, 5])
This is the information collected from each file:
text_summary %>%
kable() %>%
kable_styling()
| fileNames | fileSizeInMb | numberOfLines | numberOfCharacters | numberOfWords | language | fileType |
|---|---|---|---|---|---|---|
| de_DE/de_DE.blogs.txt | 85.460 | 371440 | 83204145 | 12653019 | Deutsch | Blogs |
| de_DE/de_DE.news.txt | 95.592 | 244743 | 93388799 | 13219287 | Deutsch | News |
| de_DE/de_DE.twitter.txt | 75.578 | 947774 | 72776632 | 11803476 | Deutsch | Twitter |
| en_US/en_US.blogs.txt | 210.160 | 899288 | 206824505 | 37334131 | English | Blogs |
| en_US/en_US.news.txt | 205.812 | 1010243 | 203223160 | 34372531 | English | News |
| en_US/en_US.twitter.txt | 167.105 | 2360148 | 162096031 | 30373543 | English | Twitter |
| fi_FI/fi_FI.blogs.txt | 108.504 | 439785 | 102911932 | 12731005 | Finnish | Blogs |
| fi_FI/fi_FI.news.txt | 94.234 | 485758 | 89557513 | 10445964 | Finnish | News |
| fi_FI/fi_FI.twitter.txt | 25.331 | 285214 | 23685374 | 3152758 | Finnish | Twitter |
| ru_RU/ru_RU.blogs.txt | 116.856 | 337100 | 64103385 | 9405378 | Russian | Blogs |
| ru_RU/ru_RU.news.txt | 118.996 | 196360 | 64956603 | 9115829 | Russian | News |
| ru_RU/ru_RU.twitter.txt | 105.182 | 881414 | 57950970 | 9223835 | Russian | Twitter |
To understand this information better, here are barplots of each metric by language and file type.
base <- ggplot(text_summary, aes(x = language, fill = fileType)) + theme_minimal() + labs(x = 'Language', fill = 'File Type')
base + geom_bar(stat = 'identity', aes(y = numberOfLines / 10^6), position = position_dodge(), color = 'black') + labs(y = '# of Lines (in Millions)', title = 'Number of Lines per File')
base + geom_bar(stat = 'identity', aes(y = numberOfWords / 10^6), position = position_dodge(), color = 'black') + labs(y = '# of Words (in Millions)', title = 'Number of Words per File')
base + geom_bar(stat = 'identity', aes(y = numberOfCharacters / 10^6), position = position_dodge(), color = 'black') + labs(y = '# of Characters (in Millions)', title = 'Number of Characters per File')
base + geom_bar(stat = 'identity', aes(y = fileSizeInMb), position = position_dodge(), color = 'black') + labs(y = 'File Size (Mb)', title = 'File Size in Mb')
From these plots it is clear that the distribution is not the same across the four languages and the three file types. In some languages the news file is larger than the blogs or Twitter files, while in others it is not. One thing is clear, however: the English files are much larger than the others on every metric. In every language except Finnish, the Twitter file has the most lines, yet it always has the fewest words and characters, meaning shorter entries, as expected.
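As a quick check on that last point, the average words and characters per line can be derived directly from text_summary; a minimal sketch:

text_summary %>%
  mutate(wordsPerLine = numberOfWords / numberOfLines,           # average words per line
         charsPerLine = numberOfCharacters / numberOfLines) %>%  # average characters per line
  select(language, fileType, wordsPerLine, charsPerLine) %>%
  arrange(language, fileType)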
Finally, in order to see the distribution of words, bigrams and trigrams in the English-language files, I use a parallelized version of lapply() to iterate efficiently over the three files. Here I explain the step-by-step approach:
- Use the parallel package to do the job in parallel, creating a cluster of workers with makePSOCKcluster().
- Load the required libraries on each worker and export the names vector to it, using clusterEvalQ() and clusterExport(), respectively.
- Iterate over the three files with parLapply(), tokenizing and counting with the tidytext package.

ncores <- 3
cl <- makePSOCKcluster(ncores)
clusterEvalQ(cl, library(tidyverse))
clusterEvalQ(cl, library(tidytext))
clusterExport(cl, "names")
start <- Sys.time()
# Process the three English files in parallel
word_dist <- parLapply(cl, seq_along(names[1:3]), function(i) {
data("stop_words")
set.seed(123456)
pct <- 0.1 # sample 10% of the lines from each file
file <- read_lines(names[i], skip_empty_rows = TRUE)
n_characters <- nchar(file) # characters per line, computed on the full file
text <- tibble(text = file) %>%
sample_n(., pct * nrow(.))
text_df <- text %>%
unnest_tokens(word, text)
text_count_sw <- text %>%
unnest_tokens(word, text) %>%
anti_join(stop_words) %>%
count(word, sort = TRUE)
bigram_count_sw <- text %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
count(bigram, sort = TRUE) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word) %>%
unite(bigram, word1, word2, sep = " ")
trigram_count_sw <- text %>%
unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
count(trigram, sort = TRUE) %>%
separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word) %>%
filter(!word3 %in% stop_words$word) %>%
unite(trigram, word1, word2, word3, sep = " ")
text_counts <- list(text_count_sw = text_count_sw,bigram_count_sw = bigram_count_sw, trigram_count_sw = trigram_count_sw)
word_plot_sw <- ggplot(text_count_sw[1:20,], aes(x = n, y = fct_reorder(word, n))) +
geom_bar(stat = 'identity', fill = 'red3') + theme_minimal() +
labs(x = 'Appearances in Text', y = 'Word', title = 'Top 20 Most Common Words with No Stopwords')
bigram_plot_sw <- ggplot(bigram_count_sw[1:20,], aes(x = n, y = fct_reorder(bigram, n))) +
geom_bar(stat = 'identity', fill = 'green3') + theme_minimal() +
labs(x = 'Appearances in Text', y = 'Bigram', title = 'Top 20 Bigrams with No Stopwords')
trigram_plot_sw <- ggplot(trigram_count_sw[1:20,], aes(x = n, y = fct_reorder(trigram, n))) +
geom_bar(stat = 'identity', fill = 'blue3') + theme_minimal() +
labs(x = 'Appearances in Text', y = 'Trigram', title = 'Top 20 Trigrams with No Stopwords')
plots <- list(word_plot_sw = word_plot_sw, bigram_plot_sw = bigram_plot_sw, trigram_plot_sw = trigram_plot_sw)
return(list(n_characters = n_characters, text_counts = text_counts, plots = plots))
})
end <- Sys.time()
time <- end - start
stopCluster(cl)
names(word_dist) <- names[1:3]
After running this long parLapply() call, the elapsed time was 2.862 minutes on a Lenovo laptop with a 7th-generation Intel Core i7 and 16 GB of RAM.
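If the elapsed time is needed in a specific unit rather than whatever print(time) picks by default, it can be computed explicitly from the recorded timestamps; a minimal sketch:

# Elapsed time of the parLapply() call, reported in minutes
as.numeric(difftime(end, start, units = 'mins'))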
To see the structure of the result I use str(), but its output is too long to show here. The important thing to keep in mind is that the result is a list of lists, whose elements can be accessed with standard R subsetting.
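To give an idea, the stopword-free word counts for the blogs file can be pulled straight out of that structure; a minimal sketch, using the element names defined above:

# Top 10 stopword-free words in the blogs file, taken from the nested list
head(word_dist[['en_US/en_US.blogs.txt']]$text_counts$text_count_sw, 10)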
For instance, the following code, with the help of the cowplot package, displays the barplots for the blogs file.
plot_grid(word_dist[[1]]$plots$word_plot_sw, word_dist[[1]]$plots$bigram_plot_sw, word_dist[[1]]$plots$trigram_plot_sw, nrow = 3)
The same can be done for the news and Twitter files.
plot_grid(word_dist[[2]]$plots$word_plot_sw, word_dist[[2]]$plots$bigram_plot_sw, word_dist[[2]]$plots$trigram_plot_sw, nrow = 3)
plot_grid(word_dist[[3]]$plots$word_plot_sw, word_dist[[3]]$plots$bigram_plot_sw, word_dist[[3]]$plots$trigram_plot_sw, nrow = 3)
Finally, I want to show the distribution of the number of characters per line in each of these three files, again using the list created above.
blogs_dist <- ggplot(tibble(blogs = word_dist[[1]]$n_characters)) + geom_histogram(aes(blogs), color = 'black', fill = '#F8766D') + theme_minimal() + labs(x = '# of Characters', y = 'Frequency', title = 'Number of Characters in each Line of the Blogs File')
news_dist <- ggplot(tibble(news = word_dist[[2]]$n_characters)) + geom_histogram(aes(news), color = 'black', fill = '#00BA38') + theme_minimal() + labs(x = '# of Characters', y = 'Frequency', title = 'Number of Characters in each Line of the News File')
twitter_dist <- ggplot(tibble(twitter = word_dist[[3]]$n_characters)) + geom_histogram(aes(twitter), color = 'black', fill = '#619CFF') + theme_minimal() + labs(x = '# of Characters', y = 'Frequency', title = 'Number of Characters in each Line of the Twitter File')
plot_grid(blogs_dist, news_dist, twitter_dist, nrow = 3)
As expected, the blogs and news entries show a much wider spread in the number of characters per line, while tweet length is capped, so the Twitter distribution is concentrated at short lengths.
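The same point can be checked numerically with the n_characters vectors stored in word_dist; a minimal sketch:

# Six-number summary of characters per line for the blogs, news and Twitter files
sapply(word_dist, function(x) summary(x$n_characters))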
As seen in the barplots of word, bigram and trigram counts, the most common words (love, time, people) are similar across sources, but the combinations of them differ considerably depending on the source. For instance, on Twitter the most common trigrams relate to holidays such as Mother's Day and Cinco de Mayo; in the news the most common trigram is President Barack Obama, while in the blogs no clear pattern emerges. More work is needed to assess the word distribution over the complete files and for every language, but this approach can be reproduced with more computational power to achieve that.
The code below performs the same task, collecting the word, bigram and trigram counts and their plots, for each of the twelve files. However, given the size of the files and the time it takes to count the n-grams, it may run for several hours.
ncores <- 3
cl <- makePSOCKcluster(ncores)
clusterEvalQ(cl, library(tidyverse))
clusterEvalQ(cl, library(tidytext))
clusterExport(cl, "names")
start <- Sys.time()
# Process all twelve files in parallel (full files, no sampling)
word_dist <- parLapply(cl, seq_along(names[1:12]), function(i) {
file <- read_lines(names[i], skip_empty_rows = T)
n_characters <- nchar(file)
print(paste('Done nchar for', names[i]))
text <- tibble(text = file)
text_df <- text %>%
unnest_tokens(word, text)
print(paste('Done text_df for', names[i]))
text_count <- text_df %>%
count(word, sort = TRUE)
print(paste('Done count for', names[i]))
text_count_sw <- text %>%
unnest_tokens(word, text) %>%
anti_join(stop_words) %>%
count(word, sort = TRUE)
print(paste('Done count_sw for', names[i]))
bigram_count <- text %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
count(bigram, sort = TRUE)
print(paste('Done bigram for', names[i]))
bigram_count_sw <- text %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
count(bigram, sort = TRUE) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word) %>%
unite(bigram, word1, word2, sep = " ")
print(paste('Done bigram_sw for', names[i]))
trigram_count <- text %>%
unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
count(trigram, sort = TRUE)
print(paste('Done trigram for', names[i]))
trigram_count_sw <- text %>%
unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
count(trigram, sort = TRUE) %>%
separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word) %>%
filter(!word3 %in% stop_words$word) %>%
unite(trigram, word1, word2, word3, sep = " ")
print(paste('Done trigram_sw for', names[i]))
text_counts <- list(text_count = text_count, text_count_sw = text_count_sw, bigram_count = bigram_count, bigram_count_sw = bigram_count_sw, trigram_count = trigram_count, trigram_count_sw = trigram_count_sw)
word_plot <- ggplot(text_count[1:20,], aes(x = n, y = fct_reorder(word, n))) +
geom_bar(stat = 'identity', fill = 'darkred') + theme_minimal() +
labs(x = 'Appearances in Text', y = 'Word', title = 'Top 20 Most Common Words')
word_plot_sw <- ggplot(text_count_sw[1:20,], aes(x = n, y = fct_reorder(word, n))) +
geom_bar(stat = 'identity', fill = 'red3') + theme_minimal() +
labs(x = 'Appearances in Text', y = 'Word', title = 'Top 20 Most Common Words with No Stopwords')
bigram_plot <- ggplot(bigram_count[1:20,], aes(x = n, y = fct_reorder(bigram, n))) +
geom_bar(stat = 'identity', fill = 'darkgreen') + theme_minimal() +
labs(x = 'Appearances in Text', y = 'Bigram', title = 'Top 20 Bigrams')
bigram_plot_sw <- ggplot(bigram_count_sw[1:20,], aes(x = n, y = fct_reorder(bigram, n))) +
geom_bar(stat = 'identity', fill = 'green3') + theme_minimal() +
labs(x = 'Appearances in Text', y = 'Bigram', title = 'Top 20 Bigrams with No Stopwords')
trigram_plot <- ggplot(trigram_count[1:20,], aes(x = n, y = fct_reorder(trigram, n))) +
geom_bar(stat = 'identity', fill = 'darkblue') + theme_minimal() +
labs(x = 'Appearances in Text', y = 'Trigram', title = 'Top 20 Trigrams')
trigram_plot_sw <- ggplot(trigram_count_sw[1:20,], aes(x = n, y = fct_reorder(trigram, n))) +
geom_bar(stat = 'identity', fill = 'blue3') + theme_minimal() +
labs(x = 'Appearances in Text', y = 'Trigram', title = 'Top 20 Trigrams with No Stopwords')
plots <- list(word_plot = word_plot, word_plot_sw = word_plot_sw, bigram_plot = bigram_plot, bigram_plot_sw = bigram_plot_sw, trigram_plot = trigram_plot, trigram_plot_sw = trigram_plot_sw)
return(list(n_characters = n_characters, text_counts = text_counts, plots = plots))
})
end <- Sys.time()
time <- end - start
stopCluster(cl)
names(word_dist) <- names[1:12]
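Since this run can take several hours, it is worth persisting the result so the counts and plots can be reloaded later without recomputing; a minimal sketch (the file name is illustrative):

# Save the full result for later sessions; the file name is illustrative
saveRDS(word_dist, 'word_dist_all_files.rds')
# word_dist <- readRDS('word_dist_all_files.rds')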