A smart keyboard can make typing on mobile devices much easier, and several companies are developing such technology with natural language processing and predictive text models. SwiftKey, the capstone project partner, is one of the leading companies in this field and provided the data used here to build a predictive text product. In this exploratory data analysis, we perform some basic analyses to understand the text data sets and lay out a preliminary plan for building and selecting a prediction model.
The training data to get started is available here: Dataset. It consists of 4 locales: en_US, de_DE, ru_RU, and fi_FI. For this project, we explore the English (en_US) text files.
## Load the necessary packages
library(ggplot2)
library(tm)
library(wordcloud)
library(RColorBrewer)
library(ngram)
## set to the desired directory and create a data directory
if (!file.exists("data")){
dir.create("data")
}
## Download the data
dataURL <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
download.file(dataURL, destfile = "./data/swiftkey.zip", method = "curl")
unzip("./data/swiftkey.zip", exdir = "./data")
## file.remove("./data/swiftkey.zip")
## get the path directory of the en_US .txt files
flist <- list.files(path = "./data",
recursive = TRUE,
pattern = ".*en_.*.txt")
flist <- paste("./data", flist, sep="/")
## custom function to get the line count, word count, and the file size
finfo <- sapply(flist, function(f){
  # file size in MB
  fsize <- file.info(f)$size / 1024 / 1024
  # read the lines of the file
  con <- file(f, open = "r")
  tf <- readLines(con, skipNul = TRUE)
  close(con)
  # line count of the file
  lc <- length(tf)
  # word count of the file (split each line on whitespace)
  wc <- sum(lengths(strsplit(tf, "\\s+")))
  data.frame(fsize, lc, wc)
})
# convert finfo matrix to data frame
finfo <- data.frame(t(finfo))
colnames(finfo) <- c("file_size_MB", "line_count", "word_count")
The basic summary of the en_US text files shows large file sizes and high line and word counts. To keep the exploratory analysis fast, we therefore work with approximately 1% of the corpus to examine the distribution of frequent words in the next step.
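For reference, the summary table computed above can be displayed directly; the exact values depend on the downloaded files.
# print the basic summary of the en_US files
# (knitr::kable(finfo) would render a nicer table in a knitted report)
print(finfo)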
## sample 1% of text corpus
t_sub <- function(f, percent=0.01){
  con <- file(f, open="r")
  tf <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
  close(con)
  # remove non-ASCII characters
  tf <- iconv(tf, "latin1", "ASCII", sub = "")
  # draw a reproducible random sample of the lines
  set.seed(1011)
  sample(tf, round(length(tf) * percent))
}
## create a corpus from samples of the 3 en_US *.txt files
tCorpus <- VCorpus(VectorSource(sapply(flist, t_sub)))
Use the tm_map() function from the tm package to perform transformations on the corpus created above. Examples of transformations include stripping whitespace, removing stopwords, stemming, etc.
# eliminate extra whitespace
tCorpus <- tm_map(tCorpus, stripWhitespace)
# convert to lower case
tCorpus <- tm_map(tCorpus, content_transformer(tolower))
# remove stopwords
tCorpus <- tm_map(tCorpus, removeWords, stopwords("english"))
# stemming
tCorpus <- tm_map(tCorpus, stemDocument)
# remove numbers
tCorpus <- tm_map(tCorpus, removeNumbers)
# remove punctuations
tCorpus <- tm_map(tCorpus, removePunctuation)
Next, we convert the corpus to a document-term matrix to count the word frequencies, which can be visualized with a word cloud.
# create a document-term matrix
dtm <- DocumentTermMatrix(tCorpus)
dtm_mat <- as.matrix(dtm)
# tally up the frequency of each word across 3 text files, and sort in descending order
words <- sort(colSums(dtm_mat), decreasing = TRUE)
dtm_df <- data.frame(word = names(words), freq = words)
# create a word cloud to show more frequent words
set.seed(1234)
wordcloud::wordcloud(words = dtm_df$word,
freq = dtm_df$freq,
min.freq = 500,
max.words = 300,
rot.per = 0.35,
random.order = FALSE,
scale = c(2, 0.2),
colors = brewer.pal(8, "Dark2"))
An n-gram model is a type of probabilistic language model that predicts the next item in a sequence from the previous (n-1) items, i.e., an (n-1)-order Markov model.
NOTE: The unigram counts are equivalent to the single-word frequencies across the 3 text files shown in the word cloud above.
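As a quick illustration of what the tokenizers below produce, the ngram package loaded earlier can tabulate n-grams for a toy string; the sentence here is made up purely for demonstration, assuming the package's ngram() and get.phrasetable() helpers.
# toy example: count the bigrams in a short, made-up string
toy <- "let us know if you want us to let you know"
toy_ng <- ngram::ngram(toy, n = 2)
get.phrasetable(toy_ng)  # bigram, frequency, and proportion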
# create tokenizer for 1-4 grams
unigramTokenizer <- function(x)
unlist(lapply(ngrams(words(x), 1), paste, collapse=" "), use.names = FALSE)
bigramTokenizer <- function(x)
unlist(lapply(ngrams(words(x), 2), paste, collapse=" "), use.names = FALSE)
trigramTokenizer <- function(x)
unlist(lapply(ngrams(words(x), 3), paste, collapse=" "), use.names = FALSE)
quadgramTokenizer <- function(x)
unlist(lapply(ngrams(words(x), 4), paste, collapse=" "), use.names = FALSE)
# convert corpus to document term matrix per n-grams
unigram_dtm <- DocumentTermMatrix(tCorpus, control = list(tokenize=unigramTokenizer))
bigram_dtm <- DocumentTermMatrix(tCorpus, control = list(tokenize=bigramTokenizer))
trigram_dtm <- DocumentTermMatrix(tCorpus, control = list(tokenize=trigramTokenizer))
quadgram_dtm <- DocumentTermMatrix(tCorpus, control = list(tokenize=quadgramTokenizer))
# a function to tally up the frequency of each word across 3 text files, and sort in descending order
ngram_dtm_df <- function(x) {
  x_mat <- as.matrix(x)
  n_grams <- sort(colSums(x_mat), decreasing = TRUE)
  data.frame(n_grams = names(n_grams), freq = n_grams)
}
unigram_df <- ngram_dtm_df(unigram_dtm)
bigram_df <- ngram_dtm_df(bigram_dtm)
trigram_df <- ngram_dtm_df(trigram_dtm)
quadgram_df <- ngram_dtm_df(quadgram_dtm)
# plot the top frequent words or 2-/3-/4-grams phrases in the combined corpus
ggplot(unigram_df[1:25,], aes(x=reorder(n_grams, freq), y=freq)) +
geom_bar(stat = "identity") +
coord_flip() +
xlab("(Unigram) Words") +
ylab("Frequency") +
labs(title = "Top 25 Frequent Words")
ggplot(bigram_df[1:25,], aes(x=reorder(n_grams, freq), y=freq)) +
geom_bar(stat = "identity") +
coord_flip() +
xlab("Bigram Words") +
ylab("Frequency") +
labs(title = "Top 25 Frequent Bigrams")
ggplot(trigram_df[1:25,], aes(x=reorder(n_grams, freq), y=freq)) +
geom_bar(stat = "identity") +
coord_flip() +
xlab("Trigram Words") +
ylab("Frequency") +
labs(title = "Top 25 Frequent Trigrams")
ggplot(quadgram_df[1:25,], aes(x=reorder(n_grams, freq), y=freq)) +
geom_bar(stat = "identity") +
coord_flip() +
xlab("Quadgram / 4-Grams Words") +
ylab("Frequency") +
labs(title = "Top 25 Frequent 4-Grams")
The prediction of the n-th word in an n-gram model is based on the preceding (n-1)-gram looked up in the n-gram dictionary. For example, if a user types “let us” (a bigram) on the keyboard, the prediction model looks through the trigram dictionary and suggests the next word from the most frequent trigrams starting with “let us”, i.e., a maximum-likelihood estimate of the next word given the previous two; a minimal lookup sketch follows the example output below.
## n_grams freq
## let us know let us know 31
## let us help let us help 4
## let us choose let us choose 1
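A minimal sketch of such a lookup, using the trigram_df built earlier (predict_next is a hypothetical helper name for illustration, not the final model):
# suggest up to n next-word candidates for a two-word prefix
predict_next <- function(prefix, n = 3){
  grams <- as.character(trigram_df$n_grams)
  # keep trigrams whose first two words match the typed prefix
  hit <- startsWith(grams, paste0(prefix, " "))
  # the suggested word is whatever follows the prefix in each matching trigram
  candidates <- substring(grams[hit], nchar(prefix) + 2)
  head(data.frame(next_word = candidates, freq = trigram_df$freq[hit]), n)
}
# example: suggestions after typing "let us"
predict_next("let us")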