This report contains a basic summary and exploratory analysis of the SwiftKey data set that I performed to better understand the distribution and relationship between the words, tokens, and phrases in the text. My end goal is to produce a word prediction algorithm and Shiny app using the data set.
blogs.df <- create.text.df("en_US.blogs.txt", .015)
twits.df <- create.text.df("en_US.twitter.txt", .015)
news.df <- create.text.df("en_US.news.txt", .015)
Note that only 1.5% of the total data set was sampled to reduce the resources required to run the code.
combined.df <- rbind(twits.df, news.df, blogs.df)
In this step I removed contractions, URLs, special characters, control characters, white spaces, alphanumeric characters, numbers, capitalization, common words, punctuation, and otherwise preparing the corpus for analysis.
clean.combined.df <- as.data.frame(txt.cleaning(combined.df$text))
colnames(clean.combined.df)[1] <- "text"
profanity <- readLines('badWords.txt', skipNul = T)
clean.combined.df %>%
unnest_tokens(word, text) %>%
filter(!word %in% profanity)
In this step the combined data set was split into training and test data sets using a 80/20 split. The data frames were then converted into the corpora.
set.seed(1) #make this example reproducible
clean.combined.df$id <- 1:nrow(combined.df) #create ID column
train.df <- clean.combined.df %>% dplyr::sample_frac(0.80) #use 80% of dataset as training set
test.df <- dplyr::anti_join(clean.combined.df, train.df, by = 'id') #and 20% as test set
train.corpus <- Corpus(VectorSource(train.df$text))
test.corpus <- Corpus(VectorSource(test.df$text))
writeLines(as.character(train.corpus), con= "clean.training.corpus.txt")
writeLines(as.character(test.corpus), con= "clean.test.corpus.txt")
summary(train.corpus$content)
## Length Class Mode
## 51235 character character
length(train.corpus$content)
## [1] 51235
wordcount(train.corpus$content)
## [1] 1202256
text_unigrams <- unigrams_tokenizer(train.corpus$content, remove.stopwords = T)
head(text_unigrams)
## # A tibble: 6 × 2
## ngram n
## <chr> <int>
## 1 said 3700
## 2 just 3642
## 3 one 3515
## 4 like 3202
## 5 im 2965
## 6 can 2922
library(ggplot2)
text_unigrams %>% top_n(10) %>% ggplot(aes(ngram, n)) + geom_col() + xlab(NULL) + coord_flip()
library(wordcloud)
set.seed(1234)
wordcloud(words = text_unigrams$ngram, freq = text_unigrams$n, min.freq = 5,
max.words=100, random.order=FALSE, rot.per=0.40,
colors=brewer.pal(8, "Dark2"))
text_bigrams <- bigrams_tokenizer(train.corpus$content, remove.stopwords = TRUE)
head(text_bigrams)
## # A tibble: 6 × 2
## ngram n
## <chr> <int>
## 1 right now 294
## 2 new york 267
## 3 dont know 250
## 4 last year 225
## 5 cant wait 203
## 6 high school 183
library(ggplot2)
text_bigrams %>% top_n(10) %>% ggplot(aes(ngram, n)) + geom_col() + xlab(NULL) + coord_flip()
# Generate word cloud
set.seed(1234)
wordcloud(words = text_bigrams$ngram, freq = text_bigrams$n, min.freq = 2,
max.words=100, random.order=FALSE, rot.per=0.40,
colors=brewer.pal(8, "Dark2"))
text_trigrams <- trigrams_tokenizer(train.corpus$content, remove.stopwords = T)
text_trigrams <- na.omit(text_trigrams)
head(text_trigrams)
## # A tibble: 6 × 2
## ngram n
## <chr> <int>
## 1 happy mothers day 37
## 2 new york city 37
## 3 new york times 26
## 4 im pretty sure 23
## 5 dont even know 21
## 6 let us know 21
library(ggplot2)
text_trigrams %>% top_n(10) %>% ggplot(aes(ngram, n)) + geom_col() + xlab(NULL) + coord_flip()
# Generate word cloud
set.seed(1234)
wordcloud(words = text_trigrams$ngram, freq = text_trigrams$n, min.freq = 2,
max.words=200, random.order=FALSE, rot.per=0.40,
colors=brewer.pal(8, "Dark2"))
The exploratory analyses above, especially the n-gram models and frequencies, will help shape what algorithms I decide to move forward with. The frequency of n-grams helped to understand the frequencies of various combinations and how many are needed to predict most of the data. I may also create 3- and 4-gram models to assist in this process. Once I decide on an algorithm, I will develop an app with an easy-to-understand graphical user interface where the person typing can enter a string of text into a search box and receive a list of predicted words.
create.text.df <- function(original, sample.percentage = 0.1, book = "default")
{
set.seed(1)
if(!file.exists(original))
{
print("no file")
return(NULL)
}
f <- file(original, "rb")
original.text <- readLines(f, encoding = "UTF-8", skipNul = TRUE)
close(f)
n.lines <- sort(sample(1:length(original.text),
as.integer(length(original.text) * sample.percentage),
replace = FALSE))
sampled.text <- original.text[n.lines]
return(data.frame(doc_id = 1:as.integer(length(original.text) * sample.percentage),
book = book,
text = sampled.text,
stringsAsFactors = FALSE))
}
txt.cleaning <- function(x)
{
gsub("i'm", "i am", x)
gsub("i've", "i have", x)
gsub("don't", "do not", x)
gsub("did't", "did not", x)
gsub("doesn't", "does not", x)
gsub("haven't", "have not", x)
gsub("isn't", "is not", x)
gsub("won't", "will not", x)
gsub("can't", "can not", x)
gsub("n't", " not", x)
gsub("'ll", " will", x)
gsub("'re", " are", x)
gsub("'ve", " have", x)
gsub("'m", " am", x)
gsub("'d", " would", x)
gsub("'s", "", x)
gsub("'t",'', x)
gsub('http\\S+\\s*', '', x) ## Remove URLs
gsub('\\b+RT', '', x) ## Remove RT
gsub('#\\S+', '', x) ## Remove #
gsub('*\\S+', '', x) ## Remove *
gsub('@\\S+', '', x) ## Remove @
gsub('[[:cntrl:]]', '', x) ## Remove controls characters
gsub("^[[:space:]]*","",x) ## Remove leading whitespaces
gsub("[[:space:]]*$","",x) ## Remove trailing whitespaces
gsub(' +',' ',x) ## Remove extra whitespaces
gsub("http[[:alnum:]]*",'', x) ## removes alphanumeric characters
gsub(pattern = "\\s*[[:alpha:]]*([[:alpha:]])\\1{2}[[:alpha:]]*",
replacement = " ", x, ignore.case = TRUE)
gsub("^[0-9]|[0-9]$", "", x) ## Remove numbers
gsub('[[:punct:]]', '', x) ## Remove punctuations
}
unigrams_tokenizer <- function(text, remove.stopwords = FALSE)
{
custom.stopwords <- data.frame(word = stopwords('english'),
lexicon = "mylexicon")
tibble(text = text) %>%
mutate(text = replace_contraction(text)) %>% #remove contractions
mutate(word = gsub("'s","", text)) %>%
unnest_tokens(ngram, text, token = "ngrams", n = 1) %>% # unnest tokens by ngram
mutate(stem = wordStem(ngram)) %>% # Stemming
filter(!grepl('[0-9]', ngram)) %>% # remove numbers
filter(!ngram %in% profanity) %>% # remove profanity
{if(remove.stopwords) # remove stop words
filter(.,!ngram %in% custom.stopwords$word) else .} %>%
count(ngram, sort = TRUE)
}
bigrams_tokenizer <- function(text, remove.stopwords = TRUE) {
custom.stopwords <- data.frame(word = stopwords('english'),
lexicon = "mylexicon")
tibble(text = text) %>%
mutate(word = gsub("'s","", text)) %>%
mutate(word = gsub("'t","", text)) %>%
mutate(text = replace_contraction(text)) %>%
unnest_tokens(ngram, text, token = "ngrams", n = 2) %>%
mutate(stem = wordStem(ngram)) %>% # Stemming
filter(!grepl('[0-9]', ngram)) %>% # remove numbers
filter(!ngram %in% profanity) %>% # remove profanity
#drop_na() %>%
na.omit() %>%
separate(ngram, c("word1","word2")) %>%
{if(remove.stopwords)
filter(.,!word1 %in% custom.stopwords$word,
!word2 %in% custom.stopwords$word) else .} %>%
count(word1, word2, sort = TRUE) %>%
unite(ngram, word1, word2, sep = " ")
}
trigrams_tokenizer <- function(text, remove.stopwords = FALSE)
{
custom.stopwords <- data.frame(word = stopwords('english'),
lexicon = "mylexicon")
tibble(text = text) %>%
mutate(text = replace_contraction(text)) %>% #remove contractions
unnest_tokens(ngram, text, token = "ngrams", n = 3, drop = T,
to_lower = T) %>%
mutate(stem = wordStem(ngram)) %>% # stemming
filter(!grepl('[0-9]', ngram)) %>% # remove numbers
filter(!ngram %in% profanity) %>% # remove profanity
drop_na() %>%
separate(ngram, c("word1","word2","word3")) %>%
{if(remove.stopwords)
filter(.,!word1 %in% custom.stopwords$word,
!word2 %in% custom.stopwords$word,
!word3 %in% custom.stopwords$word) else .} %>%
count(word1, word2, word3, sort = TRUE) %>%
unite(ngram, word1, word2, word3, sep = " ", na.rm = TRUE)
}