The purpose of this project is to create a web-based system that will predict the most probable next words based on user input. The prediction algorithm will learn phrase probabilities from three SwiftKey data sets (Tweets, News, and Blog posts), and the application will use these probabilities to suggest the most likely next words as the user types. (Note: see the Appendix for the code used in this report.)
The following terms are used in this report:
The Exploratory Data Analysis (EDA) methodology for this project will be to:
The system will use Markov prediction based on the probabilities of bigrams and trigrams in the combined data sets. Markov prediction determines the most likely next word from the probabilities of short n-grams rather than whole sentences. This approach will perform better than attempting to assign probabilities to whole phrases, because the data sets cannot contain every possible sequence of words a user might enter.
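As a minimal sketch of this lookup (the `predict.next.word` helper and the shape of the trigram table are assumptions for illustration, not the final implementation), the two most recent words entered by the user can be matched against the stored trigram frequencies:

```r
# Sketch of Markov-style next-word prediction from trigram counts.
# Assumes `trigram.freq` is a data frame with one row per trigram and
# columns `word` (the trigram text) and `frequency` (its count), as in
# the frequency tables shown later in this report.
predict.next.word <- function(trigram.freq, w1, w2, n = 3) {
  prefix <- paste(w1, w2)
  # Keep only trigrams whose first two words match the user's input
  matches <- trigram.freq[startsWith(trigram.freq$word, paste0(prefix, " ")), ]
  if (nrow(matches) == 0) return(character(0))
  # The predictions are the third words of the most frequent matches
  top <- head(matches[order(-matches$frequency), ], n)
  vapply(strsplit(top$word, " "), `[`, character(1), 3)
}

# Example: predict.next.word(trigram.freq, "thanks", "for")
# would most likely return "the" first, given the counts shown below.
```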
Exploratory Data Analysis was done on three data sets made available by SwiftKey:
| Data Set | Line Count | Word Count |
|---|---|---|
| Blog Posts | 899,288 | 37,272,578 |
| News | 1,010,242 | 34,309,642 |
| Twitter Tweets | 2,360,148 | 30,341,028 |
| Consolidated | 4,269,678 | 101,923,248 |
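The line and word counts above were computed from the raw text files. A short sketch of how they can be reproduced is shown below (assuming the files have already been unzipped to `final/en_US/` as in the appendix; the `count.lines.words` helper is illustrative, and exact word counts can vary slightly with the counting method):

```r
library(stringi)

# Illustrative helper: count lines and words in one raw data file
count.lines.words <- function(path) {
  txt <- readLines(path, encoding = "UTF-8", skipNul = TRUE)
  c(lines = length(txt), words = sum(stri_count_words(txt)))
}

count.lines.words("final/en_US/en_US.blogs.txt")
count.lines.words("final/en_US/en_US.news.txt")
count.lines.words("final/en_US/en_US.twitter.txt")
```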
The data sets were combined into a consolidated data set, and the text was then cleaned: punctuation, numbers, and extraneous white space were removed, and all text was converted to lower case.
| Library | Purpose |
|---|---|
| dplyr | Data manipulation |
| ggplot2 | Creation of plots |
| RWeka | Creation of n-grams |
| stringi | String processing |
| tm | Creation and cleaning of corpora |
A sample of 1% of the combined data set was created, equating to 33,367 of the total 4,269,678 lines of text. This sample was analyzed to find the most common unigrams, bigrams, and trigrams.
| Type | Frequency | n-gram |
|---|---|---|
| unigrams | 29721 | the |
| unigrams | 15678 | and |
| unigrams | 8543 | you |
| unigrams | 7892 | for |
| unigrams | 7360 | that |
| unigrams | 4921 | with |
| unigrams | 4274 | this |
| unigrams | 4144 | was |
| unigrams | 3925 | have |
| unigrams | 3573 | are |
| bigrams | 2652 | of the |
| bigrams | 2456 | in the |
| bigrams | 1431 | for the |
| bigrams | 1360 | to the |
| bigrams | 1316 | on the |
| bigrams | 1193 | to be |
| bigrams | 898 | at the |
| bigrams | 785 | i have |
| bigrams | 763 | in a |
| bigrams | 746 | is a |
| trigrams | 247 | thanks for the |
| trigrams | 234 | one of the |
| trigrams | 199 | a lot of |
| trigrams | 149 | to be a |
| trigrams | 137 | going to be |
| trigrams | 128 | i want to |
| trigrams | 108 | i have a |
| trigrams | 108 | i love you |
| trigrams | 108 | out of the |
| trigrams | 104 | looking forward to |
require(tm)
require(RWeka)
require(dplyr)
require(stringi)
require(ggplot2)
SAMPLE.PERCENT <- 1
options(scipen=99)
options(stringsAsFactors = FALSE)
if (!file.exists("Coursera-Swiftkey.zip")) {
  # Download the raw ZIP file
  download.file(url = "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
                destfile = "Coursera-Swiftkey.zip")
  # List the file contents
  unzip(zipfile = "Coursera-Swiftkey.zip", list = TRUE)
  # Unzip the parts
  unzip(zipfile = "Coursera-Swiftkey.zip",
        files = c("final/en_US/en_US.twitter.txt",
                  "final/en_US/en_US.news.txt",
                  "final/en_US/en_US.blogs.txt"))
}
# This function loads the corpus text and saves it to a save file.
# The next time, we don't have to load the large file.
load.corpus.text <- function(corpus.name) {
  # If the save file doesn't exist, load the data from the txt file
  if (!file.exists(paste(corpus.name, ".sav", sep=""))) {
    fname <- paste("final/en_US/en_US.", corpus.name, ".txt", sep="")
    tmp.corpus <- scan(fname, sep="\n", what="character",
                       encoding="UTF-8")
    save(tmp.corpus, file=paste(corpus.name, ".sav", sep=""))
    return(tmp.corpus)
  } else {
    load(file = paste(corpus.name, ".sav", sep=""))
    return(tmp.corpus)
  }
}
# Call the function for each kind of text
twitter <- load.corpus.text("twitter")
news <- load.corpus.text("news")
blogs <- load.corpus.text("blogs")
# Create one big vector of the three text sources
all.text <- c(twitter, news, blogs)
SAMPLE.SIZE <- SAMPLE.PERCENT/100 * length(all.text)
# create a corpus with a percentage sample of all text
set.seed(42)
smp <- sample(1:length(all.text), size = SAMPLE.SIZE)
all.text.raw <- VCorpus(VectorSource(all.text[smp]))
# clean text
cleanCorpus <- function(corpus, stop.words=FALSE, stems=FALSE) {
  # This function makes the corpus consistent and
  # removes any unwanted characters.
  corpus.tmp <- tm_map(corpus, removePunctuation)
  corpus.tmp <- tm_map(corpus.tmp, stripWhitespace)
  corpus.tmp <- tm_map(corpus.tmp, content_transformer(tolower))
  corpus.tmp <- tm_map(corpus.tmp, removeNumbers)
  # We normally don't want to remove stopwords here, because we will
  # later use these to make predictions.
  if (stop.words) {
    corpus.tmp <- tm_map(corpus.tmp, removeWords,
                         c(stopwords("english"), "foo"))
  }
  # We normally won't remove stems; they're important for prediction
  if (stems) {
    corpus.tmp <- tm_map(corpus.tmp, stemDocument)
  }
  return(corpus.tmp)
}
all.text.corpus <- cleanCorpus(all.text.raw)
# Create one TermDocumentMatrix each for
# unigrams, bigrams, and trigrams
tdm.unigrams <- TermDocumentMatrix(all.text.corpus)
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdm.bigrams <- TermDocumentMatrix(all.text.corpus, control = list(tokenize = BigramTokenizer))
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tdm.trigrams <- TermDocumentMatrix(all.text.corpus, control = list(tokenize = TrigramTokenizer))
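# NOTE: reconstructed step (not in the original appendix). It converts each
# TermDocumentMatrix into a data frame of n-gram frequencies sorted in
# descending order; the table and plots below expect these as df.unigrams,
# df.bigrams, df.trigrams, and df.plot.ngrams. slam is a dependency of tm.
tdm.to.df <- function(tdm, ngram.label) {
  freqs <- sort(slam::row_sums(tdm), decreasing = TRUE)
  data.frame(ngram = ngram.label,
             frequency = as.integer(freqs),
             word = names(freqs))
}
df.unigrams <- tdm.to.df(tdm.unigrams, "unigrams")
df.bigrams  <- tdm.to.df(tdm.bigrams,  "bigrams")
df.trigrams <- tdm.to.df(tdm.trigrams, "trigrams")
# Combine the top 10 of each set for the summary table
df.plot.ngrams <- rbind(head(df.unigrams, 10),
                        head(df.bigrams, 10),
                        head(df.trigrams, 10))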
# Print a nice table of most frequent words in each n-gram set
knitr::kable(df.plot.ngrams, format="markdown",
             caption="10 most common unigrams, bigrams, and trigrams")
##################
# Pareto plot of n-gram
# This function creates a plot to display the comparative
# frequency of the top 25 words in each set of n-grams.
pareto.ngram <- function(df.ngram, plot.title,
                         how.many=25, fill.color="blue") {
  ggplot(df.ngram[1:how.many, ]) +
    aes(reorder(word, -frequency), frequency) +
    theme(axis.text.x=element_text(angle=45)) +
    geom_bar(stat="identity", fill=fill.color) +
    xlab("n-gram") +
    ggtitle(plot.title)
}
# Call the function to create plots for each set of n-grams.
pareto.ngram(df.unigrams, plot.title="25 most common unigrams",
             how.many=25, fill.color="brown")
pareto.ngram(df.bigrams, plot.title="25 most common bigrams",
             how.many=25, fill.color="darkgreen")
pareto.ngram(df.trigrams, plot.title="25 most common trigrams",
             how.many=25, fill.color="red")