To understand the distributions of and relationships between words in the data set, we first perform an exploratory data analysis.

Explore data

library(quanteda)
library(readtext)
library(stringi)
library(ggplot2)
library(cowplot)
library(reshape2)
library(dplyr)

Load Data

if(!file.exists(file.path(getwd(), 'data', 'final', 'en_US', 'en_US.blogs.txt'))){
  if(!dir.exists(file.path(getwd(), 'data'))) dir.create(file.path(getwd(), 'data'))
  download.file('https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip',
                destfile = file.path(getwd(), 'data', 'Coursera-SwiftKey.zip'),
                method = 'curl', quiet = TRUE)
  unzip(file.path(getwd(), 'data', 'Coursera-SwiftKey.zip'), exdir = file.path(getwd(), 'data'))
}

blogs <- readtext(file.path(getwd(), 'data', 'final', 'en_US', 'en_US.blogs.txt'))
news <- readtext(file.path(getwd(), 'data', 'final', 'en_US', 'en_US.news.txt'))
tweets <- readtext(file.path(getwd(), 'data', 'final', 'en_US', 'en_US.twitter.txt'))

Number of Lines

lines <- data.frame('source' = c('blogs', 'news', 'tweets'),
                     'lines' = c(stri_count_fixed(blogs, '\n'),
                                 stri_count_fixed(news, '\n'),
                                 stri_count_fixed(tweets, '\n')))
lines
##   source   lines
## 1  blogs  899287
## 2   news 1010241
## 3 tweets 2360147
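
As a sanity check on these newline counts, the lines can also be counted directly from the files with base R. This is a minimal sketch; data_dir is an assumed shorthand for the paths used above, and skipNul = TRUE guards against any embedded NUL bytes in the raw files:

data_dir <- file.path(getwd(), 'data', 'final', 'en_US')
# count lines per file; readLines() may warn about a missing final newline, which is harmless here
sapply(c('en_US.blogs.txt', 'en_US.news.txt', 'en_US.twitter.txt'),
       function(f) length(readLines(file.path(data_dir, f), skipNul = TRUE)))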

Instantiate Corpus Object for Analysis

blog_corpus <- corpus(blogs)
docvars(blog_corpus, 'Source') <- 'blogs'

news_corpus <- corpus(news)
docvars(news_corpus, 'Source') <- 'news'

twitter_corpus <- corpus(tweets)
docvars(twitter_corpus, 'Source') <- 'tweets'

# unfortunately the summary() function crashes repeatedly on the combined corpus,
# so individual per-document summaries are shown below
# combine all three corpora
combined_corpus <- c(blog_corpus, news_corpus, twitter_corpus)

# remove the raw text objects to free memory (the corpus objects are still needed)
rm(blogs, news, tweets)

Exploratory Analysis

Summary for the corpus:

corpus_summary <- data.frame()
for (i in 1:length(combined_corpus)){
  corpus_summary <- rbind(corpus_summary,summary(combined_corpus[i]))
}
corpus_summary
## Corpus consisting of 1 document, showing 1 document:
## 
##               Text  Types   Tokens Sentences Source
##    en_US.blogs.txt 488731 42829003   2072623  blogs
##     en_US.news.txt 436848 39913287   1869517   news
##  en_US.twitter.txt 563897 36726231   2580555 tweets
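As an alternative to the loop above, per-document counts can also be obtained in one call with textstat_summary() from the quanteda.textstats package. This is a sketch under the assumption that the package is installed; it may still be slow on a corpus of this size:

library(quanteda.textstats)
# returns one row per document with token, type and sentence counts, among others
textstat_summary(combined_corpus)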

Plot word and sentence counts

word_plot <- ggplot(data = corpus_summary, aes(x = Source, y = Tokens, fill = Source)) +
  geom_col() +
  guides(fill = "none") +
  scale_y_continuous(expand = c(0, 0)) +
  ylab('Word counts')

sentence_plot <- ggplot(data = corpus_summary, aes(x = Source, y = Sentences, fill = Source)) +
  geom_col() +
  scale_y_continuous(expand = c(0, 0)) +
  guides(fill = "none")

plot_grid(word_plot, sentence_plot, labels = 'AUTO')

Preprocessing

In the pre-processing step we tokenise the corpus into words and sentences and remove profanity using a public word list.

tokenised_word_list <- list()
for(i in 1:length(combined_corpus)){
  tokenised_word_list[[i]] <- tokens(combined_corpus[i], what = 'word',
                                     remove_numbers = TRUE, remove_punct = TRUE,
                                     remove_symbols = TRUE, remove_separators = TRUE,
                                     remove_url = TRUE)
}

profanity_url <- 'https://raw.githubusercontent.com/shutterstock/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en'
profanity_file_name <- file.path(getwd(), 'profanity.txt')
if(!file.exists(profanity_file_name)){
  download.file(profanity_url, profanity_file_name)
}
profanity <- readLines(profanity_file_name)

for(i in 1:length(tokenised_word_list)){
  tokenised_word_list[[i]] <- tokens_remove(tokenised_word_list[[i]], pattern = profanity)
}

word_summary <- data.frame()

for(i in 1:length(tokenised_word_list)){
  word_summary <- rbind(word_summary, summary(tokenised_word_list[[i]]))  
}
word_summary
##                Var1   Var2      Freq
## 1   en_US.blogs.txt Length  36901924
## 2   en_US.blogs.txt  Class    -none-
## 3   en_US.blogs.txt   Mode character
## 4    en_US.news.txt Length  33542726
## 5    en_US.news.txt  Class    -none-
## 6    en_US.news.txt   Mode character
## 7 en_US.twitter.txt Length  29474639
## 8 en_US.twitter.txt  Class    -none-
## 9 en_US.twitter.txt   Mode character
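
A more direct way to get these token counts is quanteda's ntoken(), which reports the number of tokens per document (the same call works for the sentence tokens below, where each token is a sentence):

# token counts per source; each list element holds a single-document tokens object
sapply(tokenised_word_list, ntoken)
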
tokenised_sentence_list <- list()
for(i in 1:length(combined_corpus)){
  tokenised_sentence_list[[i]] <- tokens(combined_corpus[i], what = 'sentence',
                                         remove_numbers = TRUE, remove_punct = TRUE,
                                         remove_symbols = TRUE, remove_separators = TRUE,
                                         remove_url = TRUE)
}

# the profanity list was already downloaded and read in above; note that each
# token here is a whole sentence, so this only drops sentences that consist of a
# single profane word
for(i in 1:length(tokenised_sentence_list)){
  tokenised_sentence_list[[i]] <- tokens_remove(tokenised_sentence_list[[i]], pattern = profanity)
}

sentence_summary <- data.frame()

for(i in 1:length(tokenised_sentence_list)){
  sentence_summary <- rbind(sentence_summary, summary(tokenised_sentence_list[[i]]))  
}
sentence_summary
##                Var1   Var2      Freq
## 1   en_US.blogs.txt Length   2072537
## 2   en_US.blogs.txt  Class    -none-
## 3   en_US.blogs.txt   Mode character
## 4    en_US.news.txt Length   1869516
## 5    en_US.news.txt  Class    -none-
## 6    en_US.news.txt   Mode character
## 7 en_US.twitter.txt Length   2578299
## 8 en_US.twitter.txt  Class    -none-
## 9 en_US.twitter.txt   Mode character

Plot tokenised word and sentence counts

plot_df <- data.frame(matrix(ncol = 1, nrow = 3))
colnames(plot_df) <- c('Freq')
row.names(plot_df) <- c('blogs', 'news', 'tweets')
plot_df$Source <- row.names(plot_df)

plot_df[1, 1] <- word_summary[1, 3]
plot_df[2, 1] <- word_summary[4, 3]
plot_df[3, 1] <- word_summary[7, 3]
tokenized_word_plot <- ggplot(data = plot_df, aes(x = Source, y = Freq, fill = Source)) +
  geom_col() +
  guides(fill = "none") +
  xlab('Source') +
  ylab('Word counts')

plot_df[1, 1] <- sentence_summary[1, 3]
plot_df[2, 1] <- sentence_summary[4, 3]
plot_df[3, 1] <- sentence_summary[7, 3]
tokenized_sentence_plot <- ggplot(data = plot_df, aes(x = Source, y = Freq, fill = Source)) +
  geom_col() +
  guides(fill = "none") +
  xlab('Source') +
  ylab('Sentence counts')
plot_grid(tokenized_word_plot, tokenized_sentence_plot, labels = 'AUTO')

Word Distributions

# make a document-feature matrix (dfm) for each source
word_dfm_list <- list()
for(i in 1:length(tokenised_word_list)){
  word_dfm_list[[i]] <- dfm(tokenised_word_list[[i]])
}
top_words <- list()
for(i in 1:length(word_dfm_list)){
  top_words[[i]] <- topfeatures(word_dfm_list[[i]], n=20)
}
# note: cbind() matches rows by rank position, not by word, so row i pairs the
# i-th most frequent word of each source and the row label comes from the blogs column
gram1 <- cbind(as.data.frame(top_words[[1]]), as.data.frame(top_words[[2]]), as.data.frame(top_words[[3]]))
colnames(gram1) <- c('blogs', 'news', 'tweets')
gram1$words <- rownames(gram1)
df <- melt(gram1, id.vars = c('words'))
ggplot(data = df, aes(x = reorder(words, value), y = value)) +
  geom_col(aes(fill = variable)) +
  coord_flip() +
  ylab('Frequency') + xlab('Top 20 common words') +
  scale_y_continuous(expand = c(0, 0)) +
  guides(fill = guide_legend(title = 'Source')) +
  theme(legend.position = 'top')

Plot Percentage of Top Ten Words

percent_dfm_word <- list()
percent_df_word <- list()

for(i in 1:length(word_dfm_list)){
  percent_dfm_word[[i]] <- dfm_weight(word_dfm_list[[i]], scheme = 'prop') * 100
  percent_df_word[[i]] <- data.frame(topfeatures(percent_dfm_word[[i]]))
  colnames(percent_df_word[[i]]) <- 'percent'
  percent_df_word[[i]]$words <- rownames(percent_df_word[[i]])
}
percent_df_word[[1]]$source <- "blogs"
percent_df_word[[2]]$source <- "news"
percent_df_word[[3]]$source <- "tweets"
combined_percent_df_word <- bind_rows(percent_df_word[[1]],percent_df_word[[2]], percent_df_word[[3]])

ggplot(data = combined_percent_df_word, aes(x = words, y = percent, colour = source, group = source)) +
  geom_point() +
  geom_line() +
  scale_x_discrete(limits = combined_percent_df_word$words) +
  labs(
    x="Words",
    y= "Percentage",
    color= "Source"
  )

Remove common words for 1-gram analysis

  • The plot above is dominated by common English stop words. We therefore reduce each word to its stem with dfm_wordstem() and remove the stop words with dfm_select(..., selection = 'remove'), using stopwords('english') as the pattern.
stemmed_dfm_list <- list()
no_common_word_dfm_list <- list()
for(i in 1:length(word_dfm_list)){
  stemmed_dfm_list[[i]] <- dfm_wordstem(word_dfm_list[[i]])
  no_common_word_dfm_list[[i]] <- dfm_select(stemmed_dfm_list[[i]], pattern = stopwords('english'),
                                             selection = 'remove')
}
  • We then identify the 20 most frequent words.
top_20_word_list <- list()
top_20_word_df_list <- list()

for(i in 1:length(no_common_word_dfm_list)){
  top_20_word_list[[i]] <- topfeatures(no_common_word_dfm_list[[i]], n=20)
  top_20_word_df_list[[i]] <- as.data.frame(top_20_word_list[[i]])
  top_20_word_df_list[[i]]$words <- row.names(top_20_word_df_list[[i]])
}
top_20_word_df_list[[1]]$source <- "blogs"
top_20_word_df_list[[2]]$source <- "news"
top_20_word_df_list[[3]]$source <- "tweets"

combined_top_20_df <- bind_rows(top_20_word_df_list[[1]], top_20_word_df_list[[2]], top_20_word_df_list[[3]])
colnames(combined_top_20_df) <- c('frequency', 'words', 'source')


ggplot(data = combined_top_20_df, aes(x = reorder(words, frequency), y = frequency, fill = source)) +
  geom_col(width = 0.7) +
  ylab('Frequency') + xlab('Top 20 common words') +
  scale_y_continuous(expand = c(0, 0)) +
  guides(fill = guide_legend(title = 'Source')) +
  theme(legend.position = 'top', axis.text.x = element_text(angle = 45, hjust = 1))

  • Plot the percentage of the top ten words across all sources.
no_common_word_dfm_list_percentage <- list()
no_common_word_df_list_percentage <- list()
for(i in 1:length(no_common_word_dfm_list)){
  no_common_word_dfm_list_percentage[[i]] <- dfm_weight(no_common_word_dfm_list[[i]], scheme = 'prop') * 100
  no_common_word_df_list_percentage[[i]] <- data.frame(topfeatures(no_common_word_dfm_list_percentage[[i]]))
  colnames(no_common_word_df_list_percentage[[i]]) <- 'percent'
  no_common_word_df_list_percentage[[i]]$words <- row.names(no_common_word_df_list_percentage[[i]])
}
no_common_word_df_list_percentage[[1]]$source <- 'blogs'
no_common_word_df_list_percentage[[2]]$source <- 'news'
no_common_word_df_list_percentage[[3]]$source <- 'tweets'

combined_percent_df_no_common_word <- bind_rows(no_common_word_df_list_percentage[[1]], no_common_word_df_list_percentage[[2]], no_common_word_df_list_percentage[[3]])

ggplot(data = combined_percent_df_no_common_word, aes(x = words, y = percent, group = source, colour = source)) +
  geom_line() +
  geom_point() +
  scale_x_discrete(limits = combined_percent_df_no_common_word$words) +
  theme(legend.position = 'top', axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    x="Words",
    y= "Percentage",
    color= "Source"
  )

2-grams

  • Identify 2-grams and build a dfm object.
tokenised_word_list_2_gram <- list()
dfm_list_2_gram <- list()
for(i in 1:length(tokenised_word_list)){
  tokenised_word_list_2_gram[[i]] <- tokens_ngrams(tokenised_word_list[[i]], n = 2L, concatenator = ' ')
  dfm_list_2_gram[[i]] <- dfm(tokenised_word_list_2_gram[[i]], tolower = TRUE)
}
  • Calculate the 20 most frequent 2-grams and extract the frequencies for each source.
top_20_word_list_2gram <- list()
top_20_word_df_list_2gram <- list()

for(i in 1:length(dfm_list_2_gram)){
  top_20_word_list_2gram[[i]] <- topfeatures(dfm_list_2_gram[[i]], n=20)
  top_20_word_df_list_2gram[[i]] <- as.data.frame(top_20_word_list_2gram[[i]])
  top_20_word_df_list_2gram[[i]]$words <- row.names(top_20_word_df_list_2gram[[i]])
}
top_20_word_df_list_2gram[[1]]$source <- "blogs"
top_20_word_df_list_2gram[[2]]$source <- "news"
top_20_word_df_list_2gram[[3]]$source <- "tweets"

combined_top_20_df_2gram <- bind_rows(top_20_word_df_list_2gram[[1]], top_20_word_df_list_2gram[[2]], top_20_word_df_list_2gram[[3]])
colnames(combined_top_20_df_2gram) <- c('frequency', 'words', 'source')


ggplot(data = combined_top_20_df_2gram, aes(x = reorder(words, frequency), y = frequency, fill = source)) +
  geom_col(width = 0.7) +
  ylab('Frequency') + xlab('Top 20 common 2-grams') +
  scale_y_continuous(expand = c(0, 0)) +
  guides(fill = guide_legend(title = 'Source')) +
  theme(legend.position = 'top', axis.text.x = element_text(angle = 45, hjust = 1))

3-grams

  • 3-gram dfm: unfortunately this step repeatedly ran out of memory (it needs > 12 GB), even after freeing memory with rm() (removing all objects except tokenised_word_list). The code below is therefore commented out for knitting and shown without results; it closely mirrors the 2-gram code. A memory-friendlier sampling approach is sketched after the commented code below.
#tokenised_word_list_3_gram <- list()
#dfm_list_3_gram <- list()
#for(i in 1:length(tokenised_word_list)){
#  tokenised_word_list_3_gram[[i]] <- tokens_ngrams(tokenised_word_list[[i]], n = 3L, concatenator = ' ')
#  dfm_list_3_gram[[i]] <- dfm(tokenised_word_list_3_gram[[i]], tolower = TRUE)
#}
  • Calculate the 20 most frequent 3-grams and extract the frequencies for each source.
#top_20_word_list_3gram <- list()
#top_20_word_df_list_3gram <- list()
#
#for(i in 1:length(dfm_list_3_gram)){
#  top_20_word_list_3gram[[i]] <- topfeatures(dfm_list_3_gram[[i]], n=20)
#  top_20_word_df_list_3gram[[i]] <- as.data.frame(top_20_word_list_3gram[[i]])
#  top_20_word_df_list_3gram[[i]]$words <- row.names(top_20_word_df_list_3gram[[i]])
#}
#top_20_word_df_list_3gram[[1]]$source <- "blogs"
#top_20_word_df_list_3gram[[2]]$source <- "news"
#top_20_word_df_list_3gram[[3]]$source <- "tweets"
#
#combined_top_20_df_3gram <- bind_rows(top_20_word_df_list_3gram[[1]], top_20_word_df_list_3gram[[2]], top_20_word_df_list_3gram[[3]])
#colnames(combined_top_20_df_3gram) <- c('frequency', 'words', 'source')
#

#ggplot(data = combined_top_20_df_3gram, aes(x = reorder(words, frequency), y = frequency, fill = source)) +
#  geom_col(width = 0.7) +
#  ylab('Frequency') + xlab('Top 20 common 3-grams') +
#  scale_y_continuous(expand = c(0, 0)) +
#  guides(fill = guide_legend(title = 'Source')) +
#  theme(legend.position = 'top', axis.text.x = element_text(angle = 45, hjust = 1))
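
A memory-friendlier alternative (a sketch only, not run here): reshape the corpus to sentence-level documents, sample a fraction of the sentences, and build the 3-gram dfm on the sample. The 10% sample size is an arbitrary assumption:

set.seed(42)
# one document per sentence, so sampling operates at the sentence level
sentence_corpus <- corpus_reshape(combined_corpus, to = 'sentences')
sampled_corpus <- corpus_sample(sentence_corpus, size = round(0.1 * ndoc(sentence_corpus)))
sampled_tokens <- tokens(sampled_corpus, remove_numbers = TRUE, remove_punct = TRUE,
                         remove_symbols = TRUE, remove_url = TRUE)
dfm_3gram_sample <- dfm(tokens_ngrams(sampled_tokens, n = 3L, concatenator = ' '))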

Word Coverage

word_coverage <- function(input_dfm_list, target_coverage){
  # For each dfm, compute the cumulative share of the most frequent features and
  # report the first feature at which the target coverage is reached. Note that
  # topfeatures() returns only the 10 most frequent features by default, so the
  # percentages are relative to the counts of those top-ten features.
  df_feature_list <- list()
  return_list <- list()
  list_length <- length(input_dfm_list)
  for(i in 1:list_length){
    df_feature_list[[i]] <- data.frame(topfeatures(input_dfm_list[[i]]), nfeat(input_dfm_list[[i]]))
    names(df_feature_list[[i]]) <- c("count", "total")
    df_feature_list[[i]]$words <- row.names(df_feature_list[[i]])
    df_feature_list[[i]]$percentage <- cumsum(df_feature_list[[i]]$count) / sum(df_feature_list[[i]]$count) * 100
    number_words <- min(which(round(df_feature_list[[i]]$percentage) >= target_coverage))
    return_list[[i]] <- c(number_words, df_feature_list[[i]][number_words, ])
  }
  return(return_list)
}
  • The number of the most frequent words required to reach 50% or 90% coverage when all words (stop words included) are counted.
word_coverage(word_dfm_list, 50) #50%
## [[1]]
## [[1]][[1]]
## [1] 4
## 
## [[1]]$count
## [1] 897901
## 
## [[1]]$total
## [1] 394276
## 
## [[1]]$words
## [1] "a"
## 
## [[1]]$percentage
## [1] 58.13065
## 
## 
## [[2]]
## [[2]][[1]]
## [1] 3
## 
## [[2]]$count
## [1] 884894
## 
## [[2]]$total
## [1] 351172
## 
## [[2]]$words
## [1] "and"
## 
## [[2]]$percentage
## [1] 51.28499
## 
## 
## [[3]]
## [[3]][[1]]
## [1] 4
## 
## [[3]]$count
## [1] 609415
## 
## [[3]]$total
## [1] 423957
## 
## [[3]]$words
## [1] "a"
## 
## [[3]]$percentage
## [1] 55.33224
word_coverage(word_dfm_list, 90) #90%
## [[1]]
## [[1]][[1]]
## [1] 8
## 
## [[1]]$count
## [1] 460355
## 
## [[1]]$total
## [1] 394276
## 
## [[1]]$words
## [1] "that"
## 
## [[1]]$percentage
## [1] 90.11554
## 
## 
## [[2]]
## [[2]][[1]]
## [1] 8
## 
## [[2]]$count
## [1] 346961
## 
## [[2]]$total
## [1] 351172
## 
## [[2]]$words
## [1] "that"
## 
## [[2]]$percentage
## [1] 92.48097
## 
## 
## [[3]]
## [[3]][[1]]
## [1] 9
## 
## [[3]]$count
## [1] 359138
## 
## [[3]]$total
## [1] 423957
## 
## [[3]]$words
## [1] "of"
## 
## [[3]]$percentage
## [1] 93.50452
  • The number of the most frequent words required to reach 50% or 90% coverage when stop words are removed.
word_coverage(no_common_word_dfm_list, 50) #50%
## [[1]]
## [[1]][[1]]
## [1] 5
## 
## [[1]]$count
## [1] 99194
## 
## [[1]]$total
## [1] 316494
## 
## [[1]]$words
## [1] "can"
## 
## [[1]]$percentage
## [1] 58.00009
## 
## 
## [[2]]
## [[2]][[1]]
## [1] 3
## 
## [[2]]$count
## [1] 86632
## 
## [[2]]$total
## [1] 278387
## 
## [[2]]$words
## [1] "one"
## 
## [[2]]$percentage
## [1] 49.93069
## 
## 
## [[3]]
## [[3]][[1]]
## [1] 5
## 
## [[3]]$count
## [1] 128234
## 
## [[3]]$total
## [1] 353012
## 
## [[3]]$words
## [1] "go"
## 
## [[3]]$percentage
## [1] 57.16677
word_coverage(no_common_word_dfm_list, 90) #90%
## [[1]]
## [[1]][[1]]
## [1] 9
## 
## [[1]]$count
## [1] 70847
## 
## [[1]]$total
## [1] 316494
## 
## [[1]]$words
## [1] "day"
## 
## [[1]]$percentage
## [1] 92.67862
## 
## 
## [[2]]
## [[2]][[1]]
## [1] 9
## 
## [[2]]$count
## [1] 60014
## 
## [[2]]$total
## [1] 278387
## 
## [[2]]$words
## [1] "get"
## 
## [[2]]$percentage
## [1] 93.30444
## 
## 
## [[3]]
## [[3]][[1]]
## [1] 9
## 
## [[3]]$count
## [1] 90152
## 
## [[3]]$total
## [1] 353012
## 
## [[3]]$words
## [1] "can"
## 
## [[3]]$percentage
## [1] 92.54845
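
Since topfeatures() defaults to the ten most frequent features, the coverage figures above are relative to the counts of those top-ten words only. Below is a sketch of computing coverage over the full vocabulary instead; full_coverage is a hypothetical helper, and colSums() works here because a dfm is a sparse matrix:

full_coverage <- function(x, target = 0.5){
  # sort all feature frequencies, then find the first word at which the
  # cumulative share of all tokens reaches the target
  freqs <- sort(colSums(x), decreasing = TRUE)
  which(cumsum(freqs) / sum(freqs) >= target)[1]
}
sapply(word_dfm_list, full_coverage, target = 0.5)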

Conclusions and discussions

Future plans