1 Import Data

Install and load required packages:

# Load Packages
suppressPackageStartupMessages(library(pacman))
p_load(tm, tidyverse, rebus, stringr, wordcloud, class, qdap, readtext, quanteda, stringi, filehash, RWeka, RColorBrewer)
# devtools::install_github("quanteda/quanteda.corpora")
# devtools::install_github("quanteda/quanteda.dictionaries")

Download and import text files:

url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
# Download the archive if it is missing, then extract it
if (!file.exists("Coursera-SwiftKey.zip")) {
  download.file(url = url, destfile = "Coursera-SwiftKey.zip", quiet = TRUE)
}
if (!dir.exists("final/en_US")) {
  unzip(zipfile = "Coursera-SwiftKey.zip")
}
rm(url)
# file.remove("Coursera-SwiftKey.zip") # delete file

# Import English text files
twitter_en <- readLines(con = "final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE) # skipNul = TRUE skips embedded nul characters
news_en <- readLines(con = "final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
blogs_en <- readLines(con = "final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)

Calculate and print summary statistics for each file:

Files <- dir(path = "final/en_US/", full.names=TRUE)
Summary_Statistics <- data.frame(
           File = basename(Files), 
           `File Size (MB)` = round(file.size(Files)/(1024*1024), 1), # File size (in MB)
           # Keep the same order as dir(): blogs, news, twitter
           Documents = sapply(X = list(blogs_en, news_en, twitter_en), FUN = length), # Number of documents (lines)
           # Words = sapply(X = list(blogs_en, news_en, twitter_en), FUN = function(x) sum(word_count(x), na.rm = TRUE)), # Number of words
           stringsAsFactors = FALSE
           )

colnames(Summary_Statistics) <- c("File", "File Size (MB)", "Documents")
# Print Statistics
Summary_Statistics
# Remove obsolete objects
rm(Summary_Statistics)
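
The word count column is commented out above; if needed, an approximate count can be obtained with stri_count_words from the already loaded stringi package (an alternative to qdap::word_count, assumed here rather than taken from the original analysis):

# Approximate word counts per file, in the same order as dir(): blogs, news, twitter
Words <- sapply(X = list(blogs_en, news_en, twitter_en),
                FUN = function(x) sum(stringi::stri_count_words(x), na.rm = TRUE))
Words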

Remove handles, hashtags and retweet (RT) markers from the Twitter dataset:

# Clean Twitter Data
# Remove Twitter handles (keep the captured preceding character)
twitter_en <- gsub(pattern = "(^|[^@\\w])@(\\w{1,15})\\b", replacement = "\\1", x = twitter_en) 
# Remove hashtags
twitter_en <- gsub(pattern = "#\\S+", replacement = "", x = twitter_en) 
# Remove RT references
twitter_en <- gsub(pattern = "^RT", replacement = " ", x = twitter_en) 

Merge all datasets together:

# Merge all sources in one object
en_data <- c(twitter_en, news_en, blogs_en)
save(en_data, file = "en_data.RData")

# Remove obsolete objects
rm(twitter_en, news_en, blogs_en)

2 Create training, test and validation sets

Due to computational constraints, we can only use a small fraction of the available documents.

We allocate 50% of the sampled text to the training set and 25% each to the test and validation sets.

set.seed(1)
# Reduce size of data
# load(file = "en_data.RData")
fraction <- 0.005
en_data_abridged <- sample(x = en_data, size = floor(length(en_data) * fraction), replace = FALSE) 

# Create index for training set data
training_index <- sample(x = 1:length(en_data_abridged), size = floor(length(en_data_abridged) * 0.50), replace = FALSE)

# Create Training Set (50% of documents)
training_en <- en_data_abridged[training_index]

# Split testing data into testing and validation set
testing_en <- en_data_abridged[-training_index]

# Create index for test set data
testing_index <- sample(x = 1:length(testing_en), size = floor(length(testing_en) * 0.50), replace = FALSE)

# Create Validation Set (50/2 = 25% of documents)
validation_en <- testing_en[-testing_index]

# Create Test Set (50/2 = 25% of documents)
testing_en <- testing_en[testing_index]

# head(training_en)
# head(validation_en)
# head(testing_en)
# length(training_en)
# length(validation_en)
# length(testing_en)

# length(validation_en) + length(testing_en) + length(training_en) == length(en_data_abridged)

# Save files
saveRDS(object = training_en, file = "training_en.rds")
saveRDS(testing_en, file = "testing_en.rds")
saveRDS(validation_en, file = "validation_en.rds")

# Remove obsolete objects
rm(validation_en, testing_en, training_en, training_index, testing_index, fraction, en_data, en_data_abridged)

3 Cleaning and Processing of Text Data

3.1 Create and Clean Corpus

Create a corpus from the merged English training text:

training_en <- readRDS(file = "training_en.rds")

# Convert to ASCII; lines with non-convertible characters become NA and are dropped below
training_en <- iconv(training_en, from = "UTF-8", to = "ASCII")
# Replace remaining non-graphical characters (e.g. control characters) with spaces
training_en <- str_replace_all(training_en, "[^[:graph:]]", " ") 

# head(training_en, 25)
# length(training_en)

# Remove NA values
training_en <- training_en[!is.na(training_en)]

# Create volatile corpus
training_corpus <- training_en %>% 
  VectorSource() %>% 
  VCorpus()

# class(training_corpus)

The corpus can now be cleaned with the tm_map function. We will use it to:

* Remove extra white space
* Remove punctuation
* Remove numbers
* Remove profanity
* Lowercase all terms

# Import list of profanity terms
profanity <- readLines("Profanity.txt") # Source: https://www.freewebheaders.com/download/files/base-list-of-bad-words_text-file_2018_07_30.zip

# Create function to clean corpus
clean_corpus <- function(corpus){
  corpus <- tm_map(corpus, stripWhitespace) # remove extra white space
  # corpus <- tm_map(corpus, content_transformer(replace_abbreviation)) # replace abbreviations
  corpus <- tm_map(corpus, removePunctuation, preserve_intra_word_contractions = TRUE, preserve_intra_word_dashes = TRUE) # remove punctuation, keep contractions and intra-word dashes
  corpus <- tm_map(corpus, removeNumbers) # remove numbers
  corpus <- tm_map(corpus, content_transformer(tolower)) # lowercase terms before matching the (lowercase) profanity list
  corpus <- tm_map(corpus, removeWords, c(profanity, "rt")) # remove profanity and retweet markers
  # corpus <- tm_map(corpus, function(x) iconv(enc2utf8(x), sub = "byte"))
  return(corpus)
}

save(clean_corpus, file = "clean_corpus.RData")

# Process Corpus
training_corpus_clean <- clean_corpus(training_corpus)
save(training_corpus_clean, file = "training_corpus_clean.RData")

# training_corpus_clean_enc <- tm_map(training_corpus_clean, function(x) iconv(enc2utf8(x), sub = "byte"))
# training_corpus_clean_usable <- str_replace_all(training_corpus_clean,"[^[:graph:]]", " ")

# training_corpus_clean[[15]][1]
# class(training_corpus_clean)

3.2 Create Term Document Matrix (TDM)

With a processed corpus we can create a term document matrix (TDM) or a document term matrix (DTM).
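
The DTM is simply the transpose of the TDM (documents as rows, terms as columns). For reference, a minimal sketch of that variant, which is not used further in this analysis:

# Document Term Matrix: documents as rows, terms as columns
training_dtm <- DocumentTermMatrix(x = training_corpus_clean)
# inspect(training_dtm[1:5, 1:5])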

# Create Term Document Matrix
training_tdm <- TermDocumentMatrix(x = training_corpus_clean)

# Remove Sparse Terms
sparsity_value <- 0.999
training_tdm_sparse <- removeSparseTerms(x = training_tdm, sparse = sparsity_value)
training_tdm_sparse
<<TermDocumentMatrix (terms: 1537, documents: 7401)>>
Non-/sparse entries: 63610/11311727
Sparsity           : 99%
Maximal term length: 15
Weighting          : term frequency (tf)
# Save TDM
saveRDS(training_tdm_sparse, file = "training_tdm_sparse.rds")

# Remove obsolete objects
rm(training_en, training_corpus, profanity, training_tdm)
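
As a quick sanity check (not part of the original write-up), findFreqTerms from tm lists the terms whose total frequency exceeds a threshold; the cut-off of 100 below is arbitrary:

# Terms appearing at least 100 times across the training documents
findFreqTerms(training_tdm_sparse, lowfreq = 100)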

4 N-gram Tokenization

We can use the NGramTokenizer function from the RWeka package to tokenize the documents. We will build term document matrices for unigrams, bigrams, trigrams and 4-grams, and use the removeSparseTerms function to drop sparse terms. A quick sanity check of the tokenizers follows their definitions below.

# Create custom tokenizing functions for N-Grams
Tokenizer_1Gram <- function(x) {NGramTokenizer(x, Weka_control(min = 1, max = 1))}
Tokenizer_2Gram <- function(x) {NGramTokenizer(x, Weka_control(min = 2, max = 2))}
Tokenizer_3Gram <- function(x) {NGramTokenizer(x, Weka_control(min = 3, max = 3))}
Tokenizer_4Gram <- function(x) {NGramTokenizer(x, Weka_control(min = 4, max = 4))}
save(list = c("Tokenizer_1Gram", "Tokenizer_2Gram", "Tokenizer_3Gram", "Tokenizer_4Gram"), file = "Tokenizers_NGram.RData")
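
As a quick illustration (not part of the original pipeline), the tokenizers can be applied directly to a character string:

# Example: bigram tokenization of a short sentence
Tokenizer_2Gram("the quick brown fox")
# Expected output (roughly): "the quick" "quick brown" "brown fox"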

options(digits = 10)
# 1-Gram TDM
sparsity_value <- 0.9999
TDM_1Gram_sparse <- TermDocumentMatrix(training_corpus_clean, control = list(tokenize = Tokenizer_1Gram))
TDM_1Gram_sparse <- removeSparseTerms(TDM_1Gram_sparse, sparse = sparsity_value)
TDM_1Gram_sparse
<<TermDocumentMatrix (terms: 15489, documents: 7401)>>
Non-/sparse entries: 86924/114547165
Sparsity           : 100%
Maximal term length: 68
Weighting          : term frequency (tf)
save(TDM_1Gram_sparse, file = "TDM_1Gram_sparse.RData")

# 2-Gram TDM
sparsity_value <- 0.999
TDM_2Gram_sparse <- TermDocumentMatrix(training_corpus_clean, control = list(tokenize = Tokenizer_2Gram))
TDM_2Gram_sparse <- removeSparseTerms(TDM_2Gram_sparse, sparse = sparsity_value)
TDM_2Gram_sparse
<<TermDocumentMatrix (terms: 1322, documents: 7401)>>
Non-/sparse entries: 28253/9755869
Sparsity           : 100%
Maximal term length: 15
Weighting          : term frequency (tf)
save(TDM_2Gram_sparse, file = "TDM_2Gram_sparse.RData")

# 3-Gram TDM
sparsity_value <- 0.999
TDM_3Gram_sparse <- TermDocumentMatrix(training_corpus_clean, control = list(tokenize = Tokenizer_3Gram))
TDM_3Gram_sparse <- removeSparseTerms(TDM_3Gram_sparse, sparse = sparsity_value)
TDM_3Gram_sparse
<<TermDocumentMatrix (terms: 179, documents: 7401)>>
Non-/sparse entries: 2443/1322336
Sparsity           : 100%
Maximal term length: 20
Weighting          : term frequency (tf)
save(TDM_3Gram_sparse, file = "TDM_3Gram_sparse.RData")

# 4-Gram TDM
sparsity_value <- 0.9995
TDM_4Gram_sparse <- TermDocumentMatrix(training_corpus_clean, control = list(tokenize = Tokenizer_4Gram))
TDM_4Gram_sparse <- removeSparseTerms(TDM_4Gram_sparse, sparse = sparsity_value)
TDM_4Gram_sparse
<<TermDocumentMatrix (terms: 76, documents: 7401)>>
Non-/sparse entries: 461/562015
Sparsity           : 100%
Maximal term length: 21
Weighting          : term frequency (tf)
save(TDM_4Gram_sparse, file = "TDM_4Gram_sparse.RData")

# Remove obsolete objects
rm(Tokenizer_1Gram, Tokenizer_2Gram, Tokenizer_3Gram, Tokenizer_4Gram)

5 Exploratory Data Analysis

5.2 Unigram

Print a table summarising the most frequent unigrams, along with a word cloud and a bar chart of the top 50:

options(digits = 2)

TDM_1Gram_matrix <- as.matrix(TDM_1Gram_sparse)
TDM_1Gram_Values <- sort(rowSums(TDM_1Gram_matrix), decreasing = TRUE)
TDM_1Gram_DF <- data.frame(Term = names(TDM_1Gram_Values), Count = TDM_1Gram_Values, row.names = NULL, stringsAsFactors = FALSE)
TDM_1Gram_DF <- TDM_1Gram_DF %>% mutate(Frequency = Count/sum(Count))
TDM_1Gram_DF[1:25, ]
pal <- brewer.pal(n = 9,name = "Reds")[3:9]
wordcloud(words = TDM_1Gram_DF$Term, freq = TDM_1Gram_DF$Frequency, max.words = 100, colors = pal, random.order = F)

rm(TDM_1Gram_matrix, TDM_1Gram_Values)
TDM_1Gram_DF %>% 
  mutate(`Frequency %` = round(Count/sum(Count)*100, 2)) %>% 
  arrange(desc(Count)) %>% 
  slice(1:50) %>% 
  ggplot(mapping = aes(x = reorder(Term, Count), y = Count)) +
    geom_col(fill = "tan") +
    coord_flip() +
    theme_bw() +
    labs(x = "Terms", title = "Top 50 Unigrams", subtitle = "Taken from a subset of the whole dataset") +
    theme(axis.ticks = element_blank())


5.3 Bigram

TDM_2Gram_matrix <- as.matrix(TDM_2Gram_sparse)
TDM_2Gram_Values <- sort(rowSums(TDM_2Gram_matrix), decreasing = TRUE)
TDM_2Gram_DF <- data.frame(Term = names(TDM_2Gram_Values), Count = TDM_2Gram_Values, row.names = NULL, stringsAsFactors = FALSE)
TDM_2Gram_DF <- TDM_2Gram_DF %>% mutate(Frequency = Count/sum(Count))
TDM_2Gram_DF[1:25, ]
pal <- brewer.pal(n = 9,name = "Reds")[3:9]
wordcloud(words = TDM_2Gram_DF$Term, freq = TDM_2Gram_DF$Frequency, max.words = 100, colors = pal, random.order = F)

rm(TDM_2Gram_matrix, TDM_2Gram_Values)
TDM_2Gram_DF %>% 
  mutate(`Frequency %` = round(Count/sum(Count)*100, 2)) %>% 
  arrange(desc(Count)) %>% 
  slice(1:50) %>% 
  ggplot(mapping = aes(x = reorder(Term, Count), y = Count)) +
    geom_col(fill = "tan") +
    coord_flip() +
    theme_bw() +
    labs(x = "Terms", title = "Top 50 Bigrams", subtitle = "Taken from a subset of the whole dataset") +
    theme(axis.ticks = element_blank())


5.4 Trigram

TDM_3Gram_matrix <- as.matrix(TDM_3Gram_sparse)
TDM_3Gram_Values <- sort(rowSums(TDM_3Gram_matrix), decreasing = TRUE)
TDM_3Gram_DF <- data.frame(Term = names(TDM_3Gram_Values), Count = TDM_3Gram_Values, row.names = NULL, stringsAsFactors = FALSE)
TDM_3Gram_DF <- TDM_3Gram_DF %>% mutate(Frequency = Count/sum(Count))

pal <- brewer.pal(n = 9,name = "Reds")[3:9]
wordcloud(words = TDM_3Gram_DF$Term, freq = TDM_3Gram_DF$Frequency, max.words = 100, colors = pal, random.order = F)

rm(TDM_3Gram_matrix, TDM_3Gram_Values)
TDM_3Gram_DF %>% 
  arrange(desc(Count)) %>% 
  slice(1:50) %>% 
  ggplot(mapping = aes(x = reorder(Term, Count), y = Count)) +
    geom_col(fill = "tan") +
    coord_flip() +
    theme_bw() +
    labs(x = "Terms", title = "Top 50 Trigrams", subtitle = "Taken from a subset of the whole dataset") +
    theme(axis.ticks = element_blank())


5.5 4-Gram

TDM_4Gram_matrix <- as.matrix(TDM_4Gram_sparse)
TDM_4Gram_Values <- sort(rowSums(TDM_4Gram_matrix), decreasing = TRUE)
TDM_4Gram_DF <- data.frame(Term = names(TDM_4Gram_Values), Count = TDM_4Gram_Values, row.names = NULL, stringsAsFactors = FALSE)
TDM_4Gram_DF <- TDM_4Gram_DF %>% mutate(Frequency = Count/sum(Count))

pal <- brewer.pal(n = 9,name = "Reds")[3:9]
wordcloud(words = TDM_4Gram_DF$Term, freq = TDM_4Gram_DF$Frequency, max.words = 100, colors = pal, random.order = F)

rm(TDM_4Gram_matrix, TDM_4Gram_Values)
TDM_4Gram_DF %>% 
  arrange(desc(Count)) %>% 
  slice(1:50) %>% 
  ggplot(mapping = aes(x = reorder(Term, Count), y = Count)) +
    geom_col(fill = "tan") +
    coord_flip() +
    theme_bw() +
    labs(x = "Terms", title = "Top 50 4-Grams", subtitle = "Taken from a subset of the whole dataset") +
    theme(axis.ticks = element_blank())

rm(TDM_1Gram_sparse, TDM_2Gram_sparse, TDM_3Gram_sparse, TDM_4Gram_sparse)

6 Predicting Words

To get a first idea of how next-word prediction could work, we look up the most frequent n-grams that begin with a given word or phrase (anchoring the pattern on a complete leading word):

word <- "the"

TDM_2Gram_DF[grepl(pattern = paste0("^", word, " "), x = TDM_2Gram_DF$Term),]
word <- "thank "

TDM_2Gram_DF[grepl(pattern = paste0("^", word), x = TDM_2Gram_DF$Term),]
word <- "how are"

TDM_3Gram_DF[grepl(pattern = paste0("^", word), x = TDM_3Gram_DF$Term),]
word <- "you have"

TDM_3Gram_DF[grepl(pattern = paste0("^", word), x = TDM_3Gram_DF$Term),]
word <- "there is"

TDM_3Gram_DF[grepl(pattern = paste0("^", word), x = TDM_3Gram_DF$Term),]
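
These lookups can be wrapped into a simple back-off helper. The sketch below is a rough illustration rather than the final prediction model: the function name predict_next_word and the back-off order (trigrams first, then bigrams) are my assumptions.

# Look up candidate continuations: try the 3-gram table first, then fall back to the 2-gram table
predict_next_word <- function(phrase, n = 5) {
  words <- str_split(str_squish(tolower(phrase)), " ")[[1]]
  # Try trigrams that start with the last two words of the input
  if (length(words) >= 2) {
    prefix <- paste(tail(words, 2), collapse = " ")
    hits <- TDM_3Gram_DF[grepl(pattern = paste0("^", prefix, " "), x = TDM_3Gram_DF$Term), ]
    if (nrow(hits) > 0) return(head(hits, n))
  }
  # Fall back to bigrams that start with the last word of the input
  prefix <- tail(words, 1)
  head(TDM_2Gram_DF[grepl(pattern = paste0("^", prefix, " "), x = TDM_2Gram_DF$Term), ], n)
}

# predict_next_word("how are")
# predict_next_word("thank")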