Loading required packages

library("readr")
library("stringi")
library("stringr")
library("tidyverse")
library("tidytext")

Loading data

# Download and extract the SwiftKey dataset if it is not already present
if (!file.exists("Coursera-SwiftKey.zip")) {
  download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
                destfile = "Coursera-SwiftKey.zip")
  unzip("Coursera-SwiftKey.zip")
}

# Read each en_US source file into its own character vector,
# named after the file (blogs, news, twitter)
source_folder <- "final/en_US/"
for (source_file in list.files(source_folder)) {
  source <- gsub(pattern = "en_US\\.|\\.txt", replacement = "", x = source_file)
  assign(x = source, value = read_lines(paste0(source_folder, source_file)))
}

Data summary

The BLOGS source is 899,288 lines long with a total of 206,824,505 characters, the NEWS source is 1,010,242 lines long with 203,223,159 characters, and the last source, TWITTER, is 2,360,148 lines long with 162,096,031 characters. Given the size of these sources, a 5% sample of the lines of each of the blogs, news, and twitter datasets will be taken.
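
A minimal sketch of how these figures can be reproduced from the vectors loaded above (nchar() counts the characters in each line):

sapply(list(blogs = blogs, news = news, twitter = twitter),
       function(x) c(lines = length(x), characters = sum(nchar(x))))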

# Number of lines to sample from each source (5% of the total)
n_size <- c(
  blogs   = round(length(blogs) * 0.05),
  news    = round(length(news) * 0.05),
  twitter = round(length(twitter) * 0.05)
)

# Draw a reproducible sample of lines from each source; each sample is
# stored as a one-column tibble ("text") so the tokenizing steps below
# can refer to the same column name for every source
set.seed(2021L)
source_corpus <- list(
  blogs   = tibble(text = blogs)   %>% sample_n(n_size["blogs"]),
  news    = tibble(text = news)    %>% sample_n(n_size["news"]),
  twitter = tibble(text = twitter) %>% sample_n(n_size["twitter"])
)

The sample from each source is saved as an element of a list. From it, we can now get the most used words in each source. The top words look much alike across the three sources: they are mostly function words such as articles and pronouns (a quick check that removes these stop words follows the plots below).

word_list <- lapply(source_corpus, function(x) x %>% unnest_tokens(word, text))
top_words <- lapply(word_list, function(x) x %>% count(word, sort = TRUE) %>% head(15))

data.frame(top_words["blogs"]) %>%
  ggplot(aes(x = reorder(blogs.word, -blogs.n), y = blogs.n)) +
  geom_col(fill = "darkblue") +
  labs(title = "Top 15 most used words in Blogs", x = "Word", y = "Frequency")

data.frame(top_words["news"]) %>%
  ggplot(aes(x = reorder(news.word, -news.n), y = news.n)) +
  geom_col(fill = "darkred") +
  labs(title = "Top 15 most used words in News", x = "Word", y = "Frequency")

data.frame(top_words["twitter"]) %>%
  ggplot(aes(x = reorder(twitter.word, -twitter.n), y = twitter.n)) +
  geom_col(fill = "lightblue") +
  labs(title = "Top 15 most used words in Twitter", x = "Word", y = "Frequency")

To give a better idea of the structure of the sentences contained in the dataset, n-grams of up to three words will be assembled and their frequencies examined using the tidytext package. First, the most used bigrams are as follows.

ngram2_list <- lapply(source_corpus, function(x) x %>% unnest_tokens(ngram, text, token = "ngrams", n = 2))
# Lines shorter than two words tokenize to NA, so drop those before counting
top_ngram2 <- lapply(ngram2_list, function(x) x %>% filter(!is.na(ngram)) %>% count(ngram, sort = TRUE) %>% head(10))

data.frame(top_ngram2["blogs"]) %>%
  ggplot(aes(x = reorder(blogs.ngram, -blogs.n), y = blogs.n)) +
  geom_col(fill = "darkblue") +
  labs(title = "Top 10 most used bigrams in Blogs", x = "Bigram", y = "Frequency")

data.frame(top_ngram2["news"]) %>%
  ggplot(aes(x = reorder(news.ngram, -news.n), y = news.n)) +
  geom_col(fill = "darkred") +
  labs(title = "Top 10 most used bigrams in News", x = "Bigram", y = "Frequency")

data.frame(top_ngram2["twitter"]) %>%
  ggplot(aes(x = reorder(twitter.ngram, -twitter.n), y = twitter.n)) +
  geom_col(fill = "lightblue") +
  labs(title = "Top 10 most used bigrams in Twitter", x = "Bigram", y = "Frequency")

Now, the most used trigrams can give us a better understanding of the content highlighted in each sample, and they also suggest how we could predict the word that immediately follows a given bigram (a sketch of this idea follows the plots below).

ngram3_list <- lapply(source_corpus, function(x) x %>% unnest_tokens(ngram, text, token = "ngrams", n = 3))
top_ngram3 <- lapply(ngram3_list, function(x) x %>% filter(!is.na(ngram)) %>% count(ngram, sort = TRUE) %>% head(10))

data.frame(top_ngram3["blogs"]) %>%
  ggplot(aes(x = reorder(blogs.ngram, -blogs.n), y = blogs.n)) +
  geom_col(fill = "darkblue") +
  labs(title = "Top 10 most used trigrams in Blogs", x = "Trigram", y = "Frequency")

data.frame(top_ngram3["news"]) %>%
  ggplot(aes(x = reorder(news.ngram, -news.n), y = news.n)) +
  geom_col(fill = "darkred") +
  labs(title = "Top 10 most used trigrams in News", x = "Trigram", y = "Frequency")

data.frame(top_ngram3["twitter"]) %>%
  ggplot(aes(x = reorder(twitter.ngram, -twitter.n), y = twitter.n)) +
  geom_col(fill = "lightblue") +
  labs(title = "Top 10 most used trigrams in Twitter", x = "Trigram", y = "Frequency")