library(tidyverse) #ggplot and general data wrangling and reading
library(ngram) #wordcount function
library(knitr) #table visualization
library(kableExtra) #table visualization
library(tidytext) #text handling
library(parallel) #parallel processing
library(cowplot) #ggplot2 subplots

Visual Exploratory Analysis of File Info

First, I created two vectors: one with the language prefixes used in the file names and one with the file-type suffixes. Pasting them together produces the path to each file.

langs <- c('en_US/en_US', 'ru_RU/ru_RU', 'de_DE/de_DE', 'fi_FI/fi_FI')
texts <- c('.blogs.txt', '.news.txt', '.twitter.txt')

names <- NULL
for(lang in langs) {
    for(text in texts) {
        file <- paste0(lang, text)
        names <- c(names, file)
    }
}

Then I created a data frame whose first column holds the file names produced by the previous loop. Next, I wrote a function that takes the path to a file and returns a vector containing its size in MB, its number of lines, its number of characters, and its number of words.

text_summary <- data.frame(fileNames = names)

summaryOfFiles <- function(file) {
    row <- NULL
    size <- round(file.size(file) / 10^6, 3)
    read <- read_lines(file, skip_empty_rows = T)
    lines <- length(read)
    char <- sum(nchar(read))
    words <- wordcount(read, sep = ' ')
    row <- c(file, size, lines, char, words)
    return(row)
}

Next, I applied summaryOfFiles to each row of the text_summary data frame with apply(). Since apply() returns a matrix, I merge its transpose back into the original data frame.

summaryPerFile <- apply(text_summary, 1, summaryOfFiles)
text_summary <- merge(text_summary, t(summaryPerFile), by = 'fileNames')

Then I clean up the column names, create new columns that denote the language and type of text, and convert the count columns from character to numeric.

colnames(text_summary)[2:5] <- c('fileSizeInMb', 'numberOfLines', 'numberOfCharacters', 'numberOfWords')
text_summary$language <- rep(c('Deutsch', 'English','Finnish','Russian'), c(3, 3, 3, 3))
text_summary$fileType <- rep(c('Blogs', 'News', 'Twitter'), 4)

text_summary[, 2] <- as.numeric(text_summary[, 2])
text_summary[, 3] <- as.numeric(text_summary[, 3])
text_summary[, 4] <- as.numeric(text_summary[, 4])
text_summary[, 5] <- as.numeric(text_summary[, 5])
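As an aside, the same conversion can be written more compactly; a minimal equivalent sketch in base R:

text_summary[, 2:5] <- lapply(text_summary[, 2:5], as.numeric) # convert all four count columns at once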

This is the information collected from each file:

text_summary %>%
  kable() %>%
  kable_styling()
fileNames fileSizeInMb numberOfLines numberOfCharacters numberOfWords language fileType
de_DE/de_DE.blogs.txt 85.460 371440 83204145 12653019 Deutsch Blogs
de_DE/de_DE.news.txt 95.592 244743 93388799 13219287 Deutsch News
de_DE/de_DE.twitter.txt 75.578 947774 72776632 11803476 Deutsch Twitter
en_US/en_US.blogs.txt 210.160 899288 206824505 37334131 English Blogs
en_US/en_US.news.txt 205.812 1010243 203223160 34372531 English News
en_US/en_US.twitter.txt 167.105 2360148 162096031 30373543 English Twitter
fi_FI/fi_FI.blogs.txt 108.504 439785 102911932 12731005 Finnish Blogs
fi_FI/fi_FI.news.txt 94.234 485758 89557513 10445964 Finnish News
fi_FI/fi_FI.twitter.txt 25.331 285214 23685374 3152758 Finnish Twitter
ru_RU/ru_RU.blogs.txt 116.856 337100 64103385 9405378 Russian Blogs
ru_RU/ru_RU.news.txt 118.996 196360 64956603 9115829 Russian News
ru_RU/ru_RU.twitter.txt 105.182 881414 57950970 9223835 Russian Twitter

To understand this information better, here are some barplots of these metrics for each file:

base <- ggplot(text_summary, aes(x = language, fill = fileType)) + theme_minimal() + labs(x = 'Language', fill = 'File Type')

base + geom_bar(stat = 'identity', aes(y = numberOfLines / 10^6), position = position_dodge(), color = 'black') + labs(y = '# of Lines (in Millions)', title = 'Number of Lines per File')

base + geom_bar(stat = 'identity', aes(y = numberOfWords / 10^6), position = position_dodge(), color = 'black') + labs(y = '# of Words (in Millions)', title = 'Number of Words per File')

base + geom_bar(stat = 'identity', aes(y = numberOfCharacters / 10^6), position = position_dodge(), color = 'black') + labs(y = '# of Characters (in Millions)', title = 'Number of Characters per File')

base + geom_bar(stat = 'identity', aes(y = fileSizeInMb), position = position_dodge(), color = 'black') + labs(y = 'File Size (Mb)', title = 'File Size in Mb')

From these plots it is possible to conclude that the distribution is not the same across the four languages and the three file types. In some languages the news file contains more text than the blogs or Twitter files, but in others it does not. One thing is clear, however: the English files are much larger in every metric than the files in the other languages. Across languages, the Twitter files tend to have the most lines but the fewest words and characters, meaning shorter entries, as expected.

Visual Exploratory Analysis of Word Counts and Distribution in the English Files

Finally, in order to see the distribution of words, bigrams, and trigrams in the English-language files, I use parLapply(), a parallel version of lapply(), to iterate over the three files efficiently. Here is the step-by-step approach:

  1. Use the parallel package to do the job in parallel
  2. Select the number of cores and start the cluster with makePSOCKcluster()
  3. Export the libraries and variables to be used inside parLapply() with clusterEvalQ() and clusterExport(), respectively
  4. Read each file and take a random sample of 10% of its lines
  5. Create a vector with the number of characters in each line of the full file
  6. Create the tibbles (data frames) with the counts of each word, bigram, and trigram. Note: stop words were removed in order to get better insight; these include ‘of’, ‘the’, ‘and’, and many more, and a data frame of them is available in the tidytext package
  7. Plot the top 20 entries in each tibble
  8. Return a list that contains the vector of character counts, a list of the tibbles, and a list of the plots
  9. This process is done in parallel for each file
  10. Stop the cluster
  11. Name the elements of the result after the file names
ncores <- 3
cl <- makePSOCKcluster(ncores)
clusterEvalQ(cl, library(tidyverse))
clusterEvalQ(cl, library(tidytext))
clusterExport(cl, "names")
start <- Sys.time()
word_dist <- parLapply(cl, seq(names[1:3]), function(i) {
  data("stop_words")
  set.seed(123456)
  pct <- 0.1
  file <- read_lines(names[i], skip_empty_rows = T)
  n_characters <- nchar(file)

  text <- tibble(text = file) %>%
    sample_n(., pct * nrow(.))
  text_df <- text %>%
      unnest_tokens(word, text)

  text_count_sw <- text %>%
      unnest_tokens(word, text) %>%
      anti_join(stop_words) %>%
      count(word, sort = TRUE)

  bigram_count_sw <- text %>%
    unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
    count(bigram, sort = TRUE) %>%
    separate(bigram, c("word1", "word2"), sep = " ") %>%
    filter(!word1 %in% stop_words$word) %>%
    filter(!word2 %in% stop_words$word) %>%
    unite(bigram, word1, word2, sep = " ")

  trigram_count_sw <- text %>%
    unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
    count(trigram, sort = TRUE) %>%
    separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
    filter(!word1 %in% stop_words$word) %>%
    filter(!word2 %in% stop_words$word) %>%
    filter(!word3 %in% stop_words$word) %>%
    unite(trigram, word1, word2, word3, sep = " ")

  text_counts <- list(text_count_sw = text_count_sw, bigram_count_sw = bigram_count_sw, trigram_count_sw = trigram_count_sw)

  word_plot_sw <- ggplot(text_count_sw[1:20,], aes(x = n, y = fct_reorder(word, n))) +
      geom_bar(stat = 'identity', fill = 'red3') + theme_minimal() +
      labs(x = 'Appearances in Text', y = 'Word', title = 'Top 20 Most Common Words with No Stopwords')
 
  bigram_plot_sw <- ggplot(bigram_count_sw[1:20,], aes(x = n, y = fct_reorder(bigram, n))) +
    geom_bar(stat = 'identity', fill = 'green3') + theme_minimal() +
    labs(x = 'Appearances in Text', y = 'Bigram', title = 'Top 20 Bigrams with No Stopwords')
  
  trigram_plot_sw <- ggplot(trigram_count_sw[1:20,], aes(x = n, y = fct_reorder(trigram, n))) +
      geom_bar(stat = 'identity', fill = 'blue3') + theme_minimal() +
      labs(x = 'Appearances in Text', y = 'Trigram', title = 'Top 20 Trigrams with No Stopwords')
   
  plots <- list(word_plot_sw = word_plot_sw, bigram_plot_sw = bigram_plot_sw, trigram_plot_sw = trigram_plot_sw)
  
  return(list(n_characters = n_characters, text_counts = text_counts, plots = plots))
})
end <- Sys.time()
time <- end - start
stopCluster(cl)

names(word_dist) <- names[1:3]

Running this long parLapply() call took 2.862 minutes on a Lenovo laptop with 16 GB of RAM and a 7th-generation Intel Core i7.
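That figure comes from the time object computed around the parLapply() call; a minimal sketch of reporting the elapsed time in minutes, using the start and end timestamps recorded in the code above:

# elapsed time of the parLapply() call, in minutes
as.numeric(difftime(end, start, units = 'mins'))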

To see the structure of the result I would use str(), but its full output is too long to show here. The important thing to keep in mind is that the result is a list of lists whose elements can be accessed with standard R subsetting.
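For example, str() with max.level gives a truncated overview, and the individual tibbles and plots can be pulled out by name, following the lists built inside the parLapply() call above:

str(word_dist, max.level = 2)                  # truncated overview of the list of lists
head(word_dist[[1]]$text_counts$text_count_sw) # top words (stop words removed) in the blogs sample
word_dist[[1]]$plots$word_plot_sw              # the corresponding plot object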

The following code, with the help of the cowplot package, displays the barplots for the blogs file.

Blogs Top 20 Words, Bigrams and Trigrams

plot_grid(word_dist[[1]]$plots$word_plot_sw, word_dist[[1]]$plots$bigram_plot_sw, word_dist[[1]]$plots$trigram_plot_sw, nrow = 3)

Similarly, I can do the same for the news and twitter files.

News Top 20 Words, Bigrams and Trigrams

plot_grid(word_dist[[2]]$plots$word_plot_sw, word_dist[[2]]$plots$bigram_plot_sw, word_dist[[2]]$plots$trigram_plot_sw, nrow = 3)

Twitter Top 20 Words, Bigrams and Trigrams

plot_grid(word_dist[[3]]$plots$word_plot_sw, word_dist[[3]]$plots$bigram_plot_sw, word_dist[[3]]$plots$trigram_plot_sw, nrow = 3)

Character Distribution in each File

Finally, I want to show the distribution of the number of characters in each of these three files. For this, I also use the list I created.

blogs_dist <- ggplot(tibble(blogs = word_dist[[1]]$n_characters)) + geom_histogram(aes(blogs), color = 'black', fill = '#F8766D') + theme_minimal() + labs(x = '# of Characters', y = 'Frequency', title = 'Number of Characters in each Line of the Blogs File')
news_dist <- ggplot(tibble(news = word_dist[[2]]$n_characters)) + geom_histogram(aes(news), color = 'black', fill = '#00BA38') + theme_minimal() + labs(x = '# of Characters', y = 'Frequency', title = 'Number of Characters in each Line of the News File')
twitter_dist <- ggplot(tibble(twitter = word_dist[[3]]$n_characters)) + geom_histogram(aes(twitter), color = 'black', fill = '#619CFF') + theme_minimal() + labs(x = '# of Characters', y = 'Frequency', title = 'Number of Characters in each Line of the Twitter File')

plot_grid(blogs_dist, news_dist, twitter_dist, nrow = 3)

As expected, blogs and news entries show much greater variance in the number of characters per line, while tweet length is capped, so the Twitter distribution is much narrower than the other two.

Conclusion

As the barplots of word, bigram, and trigram counts show, the most common words are ones like love, time, and people, but their combinations differ considerably by source. For instance, on Twitter the most common trigrams relate to holidays such as Mother’s Day and Cinco de Mayo; in the news the most common trigram is President Barack Obama, while in the blogs no clear pattern emerges. More work is needed to assess the word distribution in the complete files and for every language, but this approach can be reproduced with more computational power to achieve that.

Additional Code

The code below performs the same task of collecting the word, bigram, and trigram counts and their plots for each of the twelve files. However, given the size of the files and the time it takes to count the n-grams, it may take several hours to run.

ncores <- 3
cl <- makePSOCKcluster(ncores, outfile = '') # outfile = '' so the worker print() progress messages below are not discarded
clusterEvalQ(cl, library(tidyverse))
clusterEvalQ(cl, library(tidytext))
clusterExport(cl, "names")
start <- Sys.time()
word_dist <- parLapply(cl, seq(names[1:12]), function(i) {
  file <- read_lines(names[i], skip_empty_rows = T)
  n_characters <- nchar(file)
  print(paste('Done nchar for', names[i]))
  text <- tibble(text = file)
  text_df <- text %>%
      unnest_tokens(word, text)
  print(paste('Done text_df for', names[i]))
  text_count <- text_df %>%
      count(word, sort = TRUE)
  print(paste('Done count for', names[i]))
  text_count_sw <- text %>%
      unnest_tokens(word, text) %>%
      anti_join(stop_words) %>%
      count(word, sort = TRUE)
  print(paste('Done count_sw for', names[i]))

  bigram_count <- text %>%
    unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
    count(bigram, sort = TRUE)
  print(paste('Done bigram for', names[i]))

  bigram_count_sw <- text %>%
    unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
    count(bigram, sort = TRUE) %>%
    separate(bigram, c("word1", "word2"), sep = " ") %>%
    filter(!word1 %in% stop_words$word) %>%
    filter(!word2 %in% stop_words$word) %>%
    unite(bigram, word1, word2, sep = " ")
  print(paste('Done bigram_sw for', names[i]))

  trigram_count <- text %>%
    unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
    count(trigram, sort = TRUE)
  print(paste('Done trigram for', names[i]))

  trigram_count_sw <- text %>%
    unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
    count(trigram, sort = TRUE) %>%
    separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
    filter(!word1 %in% stop_words$word) %>%
    filter(!word2 %in% stop_words$word) %>%
    filter(!word3 %in% stop_words$word) %>%
    unite(trigram, word1, word2, word3, sep = " ")
  print(paste('Done trigram_sw for', names[i]))

  text_counts <- list(text_count = text_count, text_count_sw = text_count_sw, bigram_count = bigram_count, bigram_count_sw = bigram_count_sw, trigram_count = trigram_count, trigram_count_sw = trigram_count_sw)

  word_plot <- ggplot(text_count[1:20,], aes(x = n, y = fct_reorder(word, n))) +
  geom_bar(stat = 'identity', fill = 'darkred') + theme_minimal() +
  labs(x = 'Appearances in Text', y = 'Word', title = 'Top 20 Most Common Words')

  word_plot_sw <- ggplot(text_count_sw[1:20,], aes(x = n, y = fct_reorder(word, n))) +
      geom_bar(stat = 'identity', fill = 'red3') + theme_minimal() +
      labs(x = 'Appearances in Text', y = 'Word', title = 'Top 20 Most Common Words with No Stopwords')


  bigram_plot <- ggplot(bigram_count[1:20,], aes(x = n, y = fct_reorder(bigram, n))) +
    geom_bar(stat = 'identity', fill = 'darkgreen') + theme_minimal() +
    labs(x = 'Appearances in Text', y = 'Bigram', title = 'Top 20 Bigrams')

  bigram_plot_sw <- ggplot(bigram_count_sw[1:20,], aes(x = n, y = fct_reorder(bigram, n))) +
    geom_bar(stat = 'identity', fill = 'green3') + theme_minimal() +
    labs(x = 'Appearances in Text', y = 'Bigram', title = 'Top 20 Bigrams with No Stopwords')

  trigram_plot <- ggplot(trigram_count[1:20,], aes(x = n, y = fct_reorder(trigram, n))) +
  geom_bar(stat = 'identity', fill = 'darkblue') + theme_minimal() +
  labs(x = 'Appearances in Text', y = 'Trigram', title = 'Top 20 Trigrams')

  trigram_plot_sw <- ggplot(trigram_count_sw[1:20,], aes(x = n, y = fct_reorder(trigram, n))) +
      geom_bar(stat = 'identity', fill = 'blue3') + theme_minimal() +
      labs(x = 'Appearances in Text', y = 'Trigram', title = 'Top 20 Trigrams with No Stopwords')

  plots <- list(word_plot = word_plot, word_plot_sw = word_plot_sw, bigram_plot = bigram_plot, bigram_plot_sw = bigram_plot_sw, trigram_plot = trigram_plot, trigram_plot_sw = trigram_plot_sw)
  
  return(list(n_characters = n_characters, text_counts = text_counts, plots = plots))
})
end <- Sys.time()
time <- end - start
stopCluster(cl)

names(word_dist) <- names[1:12]