Install and load required packages:
# Load Packages
suppressPackageStartupMessages(library(pacman))
p_load(tm, tidyverse, rebus, stringr, wordcloud, class, qdap, readtext, quanteda, stringi, filehash, RWeka, RColorBrewer)
# devtools::install_github("quanteda/quanteda.corpora")
# devtools::install_github("quanteda/quanteda.dictionaries")Download and import text files:
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
ifelse(test = !file.exists("Coursera-SwiftKey.zip"),
yes = download.file(url = url, destfile = "Coursera-SwiftKey.zip", quiet = TRUE),
no = unzip(zipfile = "Coursera-SwiftKey.zip")
)[1] "./final/en_US/en_US.twitter.txt"
rm(url)
# file.remove("Coursera-SwiftKey.zip") # delete file
# Import English text files
twitter_en <- readLines(con = "en_US.twitter.txt", encoding = "UTF-8")
news_en <- readLines(con = "en_US.news.txt", encoding = "UTF-8")
blogs_en <- readLines(con = "en_US.blogs.txt", encoding = "UTF-8")Calculate and print summary statistics for each file:
Files <- dir(path = "final/en_US/", full.names=TRUE)
Summary_Statistics <- data.frame(
File = basename(Files),
`File Size (MB)` = round(file.size(Files)/(1024*1024),1), # Calculate file size (in MB)
Documents = sapply(X = list(blogs_en, news_en, twitter_en), FUN = length), stringsAsFactors = FALSE #, # Calculate number of documents (order matches dir(): blogs, news, twitter)
# Words = sapply(X = list(blogs_en, news_en, twitter_en), FUN = function(x) sum(word_count(x), na.rm = TRUE)) # Calculate number of words
)
colnames(Summary_Statistics) <- c("File", "File Size (MB)", "Documents")
# Print Statistics
Summary_Statistics

# Remove obsolete objects
rm(Summary_Statistics)

Remove handles, hashtags and RT markers from the Twitter dataset:
# Clean Twitter Data
# Remove Twitter Handles
twitter_en <- gsub(pattern = "(^|[^@\\w])@(\\w{1,15})\\b", replacement = "", x = twitter_en)
# Remove Twitter Tags
twitter_en <- gsub(pattern = "#\\S+", replacement = "", x = twitter_en)
# Remove RT references
twitter_en <- gsub(pattern = "^RT", replacement = " ", x = twitter_en) Merge all datasets together:
# Merge all sources in one object
en_data <- c(twitter_en, news_en, blogs_en)
save(en_data, file = "en_data.RData")
# Remove obsolete objects
rm(twitter_en, news_en, blogs_en)

Due to computational constraints, we use only a small fraction of the available documents. We will allocate 50% of the sampled text to the training set and 25% each to the test and validation sets.
set.seed(1)
# Reduce size of data
# load(file = "en_data.RData")
fraction <- 0.005
en_data_abridged <- sample(x = en_data, size = length(en_data)*fraction, replace = FALSE)
# Create index for training set data
training_index <- sample(x = 1:length(en_data_abridged), size = length(en_data_abridged)*0.50, replace = FALSE) # [TD]
# Create Training Set (50% of documents)
training_en <- en_data_abridged[training_index]
# Split testing data into testing and validation set
testing_en <- en_data_abridged[-training_index]
# Create index for test set data
testing_index <- sample(x = 1:length(testing_en), size = length(testing_en)*0.50, replace = FALSE) # [TD]
# Create Validation Set (50/2 = 25% of documents)
validation_en <- testing_en[-testing_index]
# Create Test Set (50/2 = 25% of documents)
testing_en <- testing_en[testing_index]
# head(training_en)
# head(validation_en)
# head(testing_en)
# length(training_en)
# length(validation_en)
# length(testing_en)
# length(validation_en) + length(testing_en) + length(training_en) == length(en_data_abridged)
# Save files
saveRDS(object = training_en, file = "training_en.rds")
saveRDS(testing_en, file = "testing_en.rds")
saveRDS(validation_en, file = "validation_en.rds")
# Remove obsolete objects
rm(validation_en, testing_en, training_en, fraction, en_data)

Create a corpus from the English training text:
training_en <- readRDS(file = "training_en.rds")
# Change encoding
training_en <- iconv(training_en, 'UTF-8', 'ASCII')
training_en <- str_replace_all(training_en,"[^[:graph:]]", " ")
# head(training_en, 25)
# length(training_en)
# Remove NA values
training_en <- training_en[!is.na(training_en)]
# Create volatile corpus
training_corpus <- training_en %>%
VectorSource() %>%
VCorpus()
# class(training_corpus)

The corpus can be cleaned with the tm_map function. We will use it to:

* Remove extra white space
* Remove punctuation
* Remove numbers
* Remove profanity
* Lowercase all terms
# Import list of profanity terms
profanity <- readLines("Profanity.txt") # Source: https://www.freewebheaders.com/download/files/base-list-of-bad-words_text-file_2018_07_30.zip
# Create function to clean corpus
clean_corpus <- function(corpus){
corpus <- tm_map(corpus, stripWhitespace) # remove extra white spaces
# corpus <- tm_map(corpus, content_transformer(replace_abbreviation)) # Replace abbreviations
corpus <- tm_map(corpus, removePunctuation, preserve_intra_word_contractions = TRUE, preserve_intra_word_dashes = TRUE) # remove punctuation, keep contractions and intra-word dashes
corpus <- tm_map(corpus, removeNumbers) # remove numbers
corpus <- tm_map(corpus, removeWords, c(profanity, "RT")) # remove profanity from corpus and retweet reference
corpus <- tm_map(corpus, content_transformer(tolower)) # lower case terms
# corpus <- tm_map(corpus, function(x) iconv(enc2utf8(x), sub = "byte"))
return(corpus)
}
save(clean_corpus, file = "clean_corpus.RData")
# Process Corpus
training_corpus_clean <- clean_corpus(training_corpus)
save(training_corpus_clean, file = "training_corpus_clean.RData")
# training_corpus_clean_enc <- tm_map(training_corpus_clean, function(x) iconv(enc2utf8(x), sub = "byte"))
# training_corpus_clean_usable <- str_replace_all(training_corpus_clean,"[^[:graph:]]", " ")
# training_corpus_clean[[15]][1]
# class(training_corpus_clean)

With a processed corpus we can create a term document matrix (TDM) or a document term matrix (DTM).
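Only the TDM is built below; for reference, the transposed document term matrix would come from the analogous DocumentTermMatrix call. This is shown purely as an aside and is not used in the rest of the report:

# Aside (not used further): documents as rows, terms as columns
# training_dtm <- DocumentTermMatrix(x = training_corpus_clean)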
# Create Term Document Matrix
training_tdm <- TermDocumentMatrix(x = training_corpus_clean)
# Remove Sparse Terms
sparsity_value <- 0.999
training_tdm_sparse <- removeSparseTerms(x = training_tdm, sparse = sparsity_value)
training_tdm_sparse
<<TermDocumentMatrix (terms: 1537, documents: 7401)>>
Non-/sparse entries: 63610/11311727
Sparsity : 99%
Maximal term length: 15
Weighting : term frequency (tf)
# Save TDM
saveRDS(training_tdm_sparse, file = "training_tdm_sparse.rds")
# Remove obsolete objects
rm(training_en, training_corpus, profanity, training_tdm)

We can use the NGramTokenizer function from the RWeka package to tokenize the documents. We will create objects with Unigrams, Bigrams, Trigrams and 4-Grams. We will also use the removeSparseTerms function to remove sparse terms.
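As a quick illustration of what the RWeka tokenizer returns, applied here to a made-up sentence rather than the corpus:

# Illustration only: bigram tokenization of a short example string
NGramTokenizer("this is a short example", Weka_control(min = 2, max = 2))
# expected: "this is" "is a" "a short" "short example"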
# Create custom tokenizing functions for N-Grams
Tokenizer_1Gram <- function(x) {NGramTokenizer(x, Weka_control(min = 1, max = 1))}
Tokenizer_2Gram <- function(x) {NGramTokenizer(x, Weka_control(min = 2, max = 2))}
Tokenizer_3Gram <- function(x) {NGramTokenizer(x, Weka_control(min = 3, max = 3))}
Tokenizer_4Gram <- function(x) {NGramTokenizer(x, Weka_control(min = 4, max = 4))}
save(list = c("Tokenizer_1Gram", "Tokenizer_2Gram", "Tokenizer_3Gram", "Tokenizer_4Gram"), file = "Tokenizers_NGram.RData")
options(digits = 10)
# 1-Gram TDM
sparsity_value <- 0.9999
TDM_1Gram_sparse <- TermDocumentMatrix(training_corpus_clean, control = list(tokenize = Tokenizer_1Gram))
TDM_1Gram_sparse <- removeSparseTerms(TDM_1Gram_sparse, sparse = sparsity_value)
TDM_1Gram_sparse
<<TermDocumentMatrix (terms: 15489, documents: 7401)>>
Non-/sparse entries: 86924/114547165
Sparsity : 100%
Maximal term length: 68
Weighting : term frequency (tf)
save(TDM_1Gram_sparse, file = "TDM_1Gram_sparse.RData")
# 2-Gram TDM
sparsity_value <- 0.999
TDM_2Gram_sparse <- TermDocumentMatrix(training_corpus_clean, control = list(tokenize = Tokenizer_2Gram))
TDM_2Gram_sparse <- removeSparseTerms(TDM_2Gram_sparse, sparse = sparsity_value)
TDM_2Gram_sparse
<<TermDocumentMatrix (terms: 1322, documents: 7401)>>
Non-/sparse entries: 28253/9755869
Sparsity : 100%
Maximal term length: 15
Weighting : term frequency (tf)
save(TDM_2Gram_sparse, file = "TDM_2Gram_sparse.RData")
# 3-Gram TDM
sparsity_value <- 0.999
TDM_3Gram_sparse <- TermDocumentMatrix(training_corpus_clean, control = list(tokenize = Tokenizer_3Gram))
TDM_3Gram_sparse <- removeSparseTerms(TDM_3Gram_sparse, sparse = sparsity_value)
TDM_3Gram_sparse
<<TermDocumentMatrix (terms: 179, documents: 7401)>>
Non-/sparse entries: 2443/1322336
Sparsity : 100%
Maximal term length: 20
Weighting : term frequency (tf)
save(TDM_3Gram_sparse, file = "TDM_3Gram_sparse.RData")
# 4-Gram TDM
sparsity_value <- 0.9995
TDM_4Gram_sparse <- TermDocumentMatrix(training_corpus_clean, control = list(tokenize = Tokenizer_4Gram))
TDM_4Gram_sparse <- removeSparseTerms(TDM_4Gram_sparse, sparse = sparsity_value)
TDM_4Gram_sparse
<<TermDocumentMatrix (terms: 76, documents: 7401)>>
Non-/sparse entries: 461/562015
Sparsity : 100%
Maximal term length: 21
Weighting : term frequency (tf)
save(TDM_4Gram_sparse, file = "TDM_4Gram_sparse.RData")
# Remove obsolete objects
rm(Tokenizer_1Gram, Tokenizer_2Gram, Tokenizer_3Gram, Tokenizer_4Gram)

Print some of the most frequent N-Grams:
min_freq <- 50
# Print 1-Grams with at least min_freq occurrences
TDM_1Gram_sparse %>% findFreqTerms(lowfreq = min_freq)
  [1] "about" "actually" "after" "again" "all"
[6] "already" "also" "always" "amazing" "and"
[11] "another" "any" "anything" "are" "around"
[16] "away" "awesome" "back" "bad" "because"
[21] "been" "before" "being" "believe" "best"
[26] "better" "big" "birthday" "blog" "book"
[31] "both" "but" "call" "can" "check"
[36] "come" "coming" "could" "day" "days"
[41] "did" "didn" "does" "doesn" "doing"
[46] "don" "done" "down" "each" "end"
[51] "even" "ever" "every" "everyone" "excited"
[56] "family" "feel" "few" "find" "first"
[61] "follow" "food" "for" "found" "free"
[66] "friends" "from" "fun" "game" "get"
[71] "getting" "girl" "give" "god" "going"
[76] "gonna" "good" "got" "great" "guys"
[81] "had" "haha" "happy" "hard" "has"
[86] "hate" "have" "having" "hear" "help"
[91] "her" "here" "hey" "him" "his"
[96] "home" "hope" "house" "how" "into"
[101] "its" "just" "keep" "kids" "know"
[106] "last" "let" "life" "like" "little"
[111] "live" "lol" "long" "look" "looking"
[116] "lot" "love" "made" "make" "making"
[121] "man" "many" "may" "maybe" "might"
[126] "more" "morning" "most" "much" "music"
[131] "must" "need" "never" "new" "next"
[136] "nice" "night" "not" "now" "off"
[141] "old" "one" "only" "other" "our"
[146] "out" "over" "own" "party" "people"
[151] "place" "play" "please" "post" "put"
[156] "ready" "real" "really" "right" "said"
[161] "same" "say" "school" "see" "she"
[166] "should" "show" "since" "some" "someone"
[171] "something" "soon" "start" "still" "stop"
[176] "such" "summer" "sure" "take" "talk"
[181] "tell" "than" "thank" "thanks" "that"
[186] "the" "their" "them" "then" "there"
[191] "these" "they" "thing" "things" "think"
[196] "this" "those" "though" "through" "time"
[201] "today" "tomorrow" "tonight" "too" "try"
[206] "trying" "tweet" "twitter" "two" "until"
[211] "use" "used" "very" "wait" "want"
[216] "was" "watching" "way" "week" "weekend"
[221] "well" "went" "were" "what" "when"
[226] "where" "which" "while" "who" "why"
[231] "will" "win" "wish" "with" "without"
[236] "work" "working" "world" "would" "year"
[241] "years" "yes" "yet" "you" "your"
# Print 2-Grams with at least min_freq occurrences
TDM_2Gram_sparse %>% findFreqTerms(lowfreq = min_freq)
 [1] "a few" "a good" "a great" "a little" "a lot"
[6] "all of" "all the" "and a" "and i" "and the"
[11] "as a" "at the" "be a" "but i" "by the"
[16] "can t" "didn t" "do you" "doesn t" "don t"
[21] "for a" "for the" "from the" "going to" "has been"
[26] "have a" "have been" "have to" "i am" "i can"
[31] "i don" "i had" "i have" "i just" "i know"
[36] "i ll" "i love" "i m" "i think" "i ve"
[41] "i was" "i will" "i would" "if i" "if you"
[46] "in a" "in my" "in the" "is a" "is the"
[51] "it is" "it s" "it was" "need to" "of a"
[56] "of my" "of the" "on a" "on my" "on the"
[61] "one of" "out of" "s a" "so i" "so much"
[66] "thank you" "thanks for" "that i" "that is" "that s"
[71] "that the" "the best" "the first" "the most" "the same"
[76] "this is" "to a" "to be" "to do" "to get"
[81] "to go" "to have" "to make" "to see" "to the"
[86] "want to" "was a" "we are" "we have" "when i"
[91] "when you" "will be" "with a" "with the" "you are"
[96] "you can" "you have" "you re"
# Print 3-Grams with at least 10 occurrences
TDM_3Gram_sparse %>% findFreqTerms(lowfreq = 10)
 [1] "a bit of" "a couple of" "a great day"
[4] "a lot of" "all of the" "and i m"
[7] "at the end" "back to the" "be able to"
[10] "but i m" "can t believe" "can t wait"
[13] "d like to" "do you think" "don t forget"
[16] "don t have" "don t know" "end of the"
[19] "for the follow" "go to the" "going to be"
[22] "have a great" "i am not" "i can t"
[25] "i didn t" "i don t" "i feel like"
[28] "i guess i" "i had a" "i have a"
[31] "i have been" "i have the" "i have to"
[34] "i haven t" "i know i" "i ll be"
[37] "i love you" "i m a" "i m going"
[40] "i m gonna" "i m in" "i m just"
[43] "i m not" "i m so" "i m still"
[46] "i m sure" "i need a" "i need to"
[49] "i think i" "i ve been" "i want to"
[52] "i wanted to" "i will be" "i wish i"
[55] "if you are" "if you can" "if you have"
[58] "if you re" "in order to" "in the world"
[61] "is going to" "is not a" "is one of"
[64] "it has been" "it is a" "it s a"
[67] "it s been" "it s just" "it s not"
[70] "it s the" "it was a" "let me know"
[73] "let s go" "looking forward to" "m going to"
[76] "mother s day" "of my favorite" "of the day"
[79] "of the year" "one of my" "one of the"
[82] "out of the" "part of the" "rest of the"
[85] "s going to" "so i m" "some of the"
[88] "t wait for" "t wait to" "t want to"
[91] "thank you for" "thanks for following" "thanks for the"
[94] "the end of" "the fact that" "the first time"
[97] "the rest of" "there is a" "there s a"
[100] "this is a" "to be a" "to be the"
[103] "to go to" "to have a" "to make a"
[106] "to see the" "want to be" "we don t"
[109] "we need to" "when i was" "when you re"
[112] "would love to" "you can t" "you don t"
[115] "you for the" "you go to" "you have a"
[118] "you have to" "you want to"
# Print 4-Grams with at least 5 occurrences
TDM_4Gram_sparse %>% findFreqTerms(lowfreq = 5)
 [1] "at the end of" "at the same time"
[3] "can t wait for" "can t wait to"
[5] "don t have a" "don t know what"
[7] "don t think i" "don t want to"
[9] "for the shout out" "going to be a"
[11] "happy mother s day" "have a great day"
[13] "i am going to" "i can t believe"
[15] "i can t wait" "i d like to"
[17] "i d love to" "i don t have"
[19] "i don t know" "i don t think"
[21] "i m going to" "i think it s"
[23] "if you don t" "is going to be"
[25] "is one of the" "it s time to"
[27] "one of my favorite" "s going to be"
[29] "so much for the" "t wait to see"
[31] "thank you for the" "thank you so much"
[33] "thanks for the follow" "thanks so much for"
[35] "the end of the" "the rest of the"
[37] "you don t know" "you don t see"
[39] "you go to the"
Summarize the most frequent N-Grams with tables, word clouds and bar charts, starting with the Unigrams:
options(digits = 2)
TDM_1Gram_matrix <- as.matrix(TDM_1Gram_sparse)
TDM_1Gram_Values <- sort(rowSums(TDM_1Gram_matrix), decreasing = TRUE)
TDM_1Gram_DF <- data.frame(Term = names(TDM_1Gram_Values), Count = TDM_1Gram_Values, row.names = NULL, stringsAsFactors = FALSE)
TDM_1Gram_DF <- TDM_1Gram_DF %>% mutate(Frequency = Count/sum(Count))
TDM_1Gram_DF[1:25, ]

pal <- brewer.pal(n = 9, name = "Reds")[3:9]
wordcloud(words = TDM_1Gram_DF$Term, freq = TDM_1Gram_DF$Frequency, max.words = 100, colors = pal, random.order = F)

rm(TDM_1Gram_matrix, TDM_1Gram_Values)

TDM_1Gram_DF %>%
mutate(`Frequency %` = round(Count/sum(Count)*100, 2)) %>%
.[1:50,] %>%
arrange(desc(Count)) %>%
ggplot(mapping = aes(x = reorder(Term, Count), y = Count)) +
geom_col(fill = "tan") +
coord_flip(expand = c(0.05,0)) +
theme_bw() +
labs(x = "Terms", title = "Top 50 Unigrams", subtitle = "Taken from a subset of the whole dataset") +
theme(axis.ticks = element_blank())

TDM_2Gram_matrix <- as.matrix(TDM_2Gram_sparse)
TDM_2Gram_Values <- sort(rowSums(TDM_2Gram_matrix), decreasing = TRUE)
TDM_2Gram_DF <- data.frame(Term = names(TDM_2Gram_Values), Count = TDM_2Gram_Values, row.names = NULL, stringsAsFactors = FALSE)
TDM_2Gram_DF <- TDM_2Gram_DF %>% mutate(Frequency = Count/sum(Count))
TDM_2Gram_DF[1:25, ]

pal <- brewer.pal(n = 9, name = "Reds")[3:9]
wordcloud(words = TDM_2Gram_DF$Term, freq = TDM_2Gram_DF$Frequency, max.words = 100, colors = pal, random.order = F)

rm(TDM_2Gram_matrix, TDM_2Gram_Values)

TDM_2Gram_DF %>%
mutate(`Frequency %` = round(Count/sum(Count)*100, 2)) %>%
.[1:50,] %>%
arrange(desc(Count)) %>%
ggplot(mapping = aes(x = reorder(Term, Count), y = Count)) +
geom_col(fill = "tan") +
coord_flip(expand = c(0.05,0)) +
theme_bw() +
labs(x = "Terms", title = "Top 50 Bigrams", subtitle = "Taken from a subset of the whole dataset") +
theme(axis.ticks = element_blank())

TDM_3Gram_matrix <- as.matrix(TDM_3Gram_sparse)
TDM_3Gram_Values <- sort(rowSums(TDM_3Gram_matrix), decreasing = TRUE)
TDM_3Gram_DF <- data.frame(Term = names(TDM_3Gram_Values), Count = TDM_3Gram_Values, row.names = NULL, stringsAsFactors = FALSE)
TDM_3Gram_DF <- TDM_3Gram_DF %>% mutate(Frequency = Count/sum(Count))
pal <- brewer.pal(n = 9,name = "Reds")[3:9]
wordcloud(words = TDM_3Gram_DF$Term, freq = TDM_3Gram_DF$Frequency, max.words = 100, colors = pal, random.order = F)

rm(TDM_3Gram_matrix, TDM_3Gram_Values)

TDM_3Gram_DF %>%
.[1:50,] %>%
arrange(desc(Count)) %>%
ggplot(mapping = aes(x = reorder(Term, Count), y = Count)) +
geom_col(fill = "tan") +
coord_flip(expand = c(0.05,0)) +
theme_bw() +
labs(x = "Terms", title = "Top 50 Trigrams", subtitle = "Taken from a subset of the whole dataset") +
theme(axis.ticks = element_blank())

TDM_4Gram_matrix <- as.matrix(TDM_4Gram_sparse)
TDM_4Gram_Values <- sort(rowSums(TDM_4Gram_matrix), decreasing = TRUE)
TDM_4Gram_DF <- data.frame(Term = names(TDM_4Gram_Values), Count = TDM_4Gram_Values, row.names = NULL, stringsAsFactors = FALSE)
TDM_4Gram_DF <- TDM_4Gram_DF %>% mutate(Frequency = Count/sum(Count))
pal <- brewer.pal(n = 9,name = "Reds")[3:9]
wordcloud(words = TDM_4Gram_DF$Term, freq = TDM_4Gram_DF$Frequency, max.words = 100, colors = pal, random.order = F)

rm(TDM_4Gram_matrix, TDM_4Gram_Values)

TDM_4Gram_DF %>%
.[1:50,] %>%
arrange(desc(Count)) %>%
ggplot(mapping = aes(x = reorder(Term, Count), y = Count)) +
geom_col(fill = "tan") +
coord_flip(expand = c(0.05,0)) +
theme_bw() +
labs(x = "Terms", title = "Top 50 4-Grams", subtitle = "Taken from a subset of the whole dataset") +
theme(axis.ticks = element_blank())

rm(TDM_1Gram_sparse, TDM_2Gram_sparse, TDM_3Gram_sparse, TDM_4Gram_sparse)

Finally, look up which N-Grams begin with a given word or phrase; this kind of prefix search is the basis for predicting the next word:

word <- "the"
TDM_2Gram_DF[grepl(pattern = paste0("^", word), x = TDM_2Gram_DF$Term),]word <- "thank "
TDM_2Gram_DF[grepl(pattern = paste0("^", word), x = TDM_2Gram_DF$Term),]word <- "how are"
TDM_3Gram_DF[grepl(pattern = paste0("^", word), x = TDM_3Gram_DF$Term),]word <- "you have"
TDM_3Gram_DF[grepl(pattern = paste0("^", word), x = TDM_3Gram_DF$Term),]word <- "there is"
TDM_3Gram_DF[grepl(pattern = paste0("^", word), x = TDM_3Gram_DF$Term),]