We are going to produce R code that mimics the word-prediction algorithms used in mobile text messaging. The joint venture between Johns Hopkins University and SwiftKey provided training data composed of Twitter posts, blogs, and news feeds. The files come in four languages: American English, Finnish, German, and Russian. The algorithm presented below will emphasize speed, yet will hopefully still predict what the user wants to type next about as well as the current, memory-intensive methods.
Let us now look at the data. Each of the 12 given files is about 115 MB in size on average.
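As a quick sanity check on those sizes, here is a minimal sketch; it assumes the twelve raw files sit in the working directory under the names used throughout this report.
# Report the size of each raw file in megabytes
corpus_files <- c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt",
                  "fi_FI.blogs.txt", "fi_FI.news.txt", "fi_FI.twitter.txt",
                  "de_DE.blogs.txt", "de_DE.news.txt", "de_DE.twitter.txt",
                  "ru_RU.blogs.txt", "ru_RU.news.txt", "ru_RU.twitter.txt")
round(file.size(corpus_files) / 1024^2, 1)
With the sizes confirmed, the files can be read in line by line with read.table: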
English_blogs <- read.table("en_US.blogs.txt",
                            quote = "",
                            sep = "\n",
                            stringsAsFactors = FALSE)
English_news <- read.table("en_US.news.txt",
                           quote = "",
                           sep = "\n",
                           stringsAsFactors = FALSE)
English_Twitter <- read.table("en_US.twitter.txt",
                              quote = "",
                              sep = "\n",
                              stringsAsFactors = FALSE)
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec =
## dec, : embedded nul(s) found in input
Finnish_blogs <- read.table("fi_FI.blogs.txt",
                            quote = "",
                            sep = "\n",
                            stringsAsFactors = FALSE)
Finnish_news <- read.table("fi_FI.news.txt",
                           quote = "",
                           sep = "\n",
                           stringsAsFactors = FALSE)
Finnish_Twitter <- read.table("fi_FI.twitter.txt",
                              quote = "",
                              sep = "\n",
                              stringsAsFactors = FALSE)
German_blogs <- read.table("de_DE.blogs.txt",
                           quote = "",
                           sep = "\n",
                           stringsAsFactors = FALSE)
German_news <- read.table("de_DE.news.txt",
                          quote = "",
                          sep = "\n",
                          stringsAsFactors = FALSE)
German_Twitter <- read.table("de_DE.twitter.txt",
                             quote = "",
                             sep = "\n",
                             stringsAsFactors = FALSE)
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec =
## dec, : embedded nul(s) found in input
Russian_blogs <- read.table("ru_RU.blogs.txt",
                            quote = "",
                            sep = "\n",
                            stringsAsFactors = FALSE)
Russian_news <- read.table("ru_RU.news.txt",
                           quote = "",
                           sep = "\n",
                           stringsAsFactors = FALSE)
Russian_Twitter <- read.table("ru_RU.twitter.txt",
                              quote = "",
                              sep = "\n",
                              stringsAsFactors = FALSE)
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec =
## dec, : embedded nul(s) found in input
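The "embedded nul(s) found in input" warnings above are harmless for our purposes, but they can be avoided. Here is a minimal alternative sketch using base R's readLines with skipNul = TRUE, shown for the English Twitter file only:
# Alternative reader: one character string per line, silently skipping
# the embedded nul characters that trigger the warnings above
English_Twitter_lines <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)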
The nrow function will quickly find the number of lines of text in each data set.
nrow(English_blogs)
## [1] 898384
nrow(English_news)
## [1] 77258
nrow(English_Twitter)
## [1] 2302307
nrow(Finnish_blogs)
## [1] 439715
nrow(Finnish_news)
## [1] 485758
nrow(Finnish_Twitter)
## [1] 278943
nrow(German_blogs)
## [1] 181909
nrow(German_news)
## [1] 244739
nrow(German_Twitter)
## [1] 929660
nrow(Russian_blogs)
## [1] 337075
nrow(Russian_news)
## [1] 196360
nrow(Russian_Twitter)
## [1] 875002
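The same counts can be collected more compactly by looping over the loaded data sets; here is a small sketch for the English files (the other languages follow the same pattern):
# Gather the English line counts into one named vector
english_sets <- list(blogs = English_blogs, news = English_news, twitter = English_Twitter)
sapply(english_sets, nrow)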
Using the caret package (short for “classification and regression training”), we can quickly draw a random sample of the English-language files (say, with 20% of the lines going into a training set).
require(caret)
training_blogs = createDataPartition(English_blogs$V1, p = 0.2)
training_news = createDataPartition(English_news$V1, p = 0.2)
training_tweets = createDataPartition(English_Twitter$V1, p = 0.2)
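For reference, createDataPartition returns a list of row indices (named Resample1 by default), which would then be used to subset the original data frame; a rough sketch:
# Pull the sampled rows out of the full data set using the indices above
English_blogs_train <- English_blogs[training_blogs$Resample1, , drop = FALSE]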
However, grabbing even just 20% of the dataset was too taxing for my computer.
Instead, let us take a cruder approach and simply grab, say, 250 lines from each English file.
num_lines <- 250
English_blogs_sample <- English_blogs[sample(nrow(English_blogs), num_lines), ]
English_news_sample <- English_news[sample(nrow(English_news), num_lines), ]
English_Twitter_sample <- English_Twitter[sample(nrow(English_Twitter), num_lines), ]
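Since sample draws random rows, fixing a seed before those three calls would make the sampled lines, and everything derived from them below, reproducible; the seed value itself is arbitrary.
# Run once before the three sample() calls above for reproducible sampling
set.seed(1234)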
From here, we can further separate the data samples into “words”.
words_in_blogs <- unlist(strsplit(as.character(English_blogs_sample), " "))
words_in_news <- unlist(strsplit(as.character(English_news_sample), " "))
words_in_Twitter <- unlist(strsplit(as.character(English_Twitter_sample), " "))
English_words <- c(words_in_blogs, words_in_news, words_in_Twitter)
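Splitting on a single space keeps punctuation attached to words and produces empty strings wherever two spaces appear in a row. Here is a minimal sketch of a slightly more forgiving split, on runs of whitespace with empty tokens dropped, shown for the blogs sample only:
# Split on one-or-more whitespace characters instead of a single space,
# then drop any empty tokens left over
words_in_blogs_alt <- unlist(strsplit(as.character(English_blogs_sample), "\\s+"))
words_in_blogs_alt <- words_in_blogs_alt[nzchar(words_in_blogs_alt)]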
More specifically, here are the word counts for each sampled data set.
length(words_in_blogs)
## [1] 10973
length(words_in_news)
## [1] 8614
length(words_in_Twitter)
## [1] 2868
Here we will look at the patterns in the words themselves. First, we can use substring to extract the first letter of each word and nchar to measure the length of each word.
first_letters <- substring(English_words, 1, 1)
word_lengths <- nchar(English_words)
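For a quick numeric look before plotting, the most common first letters (case-folded, matching the chart below) can be tabulated directly:
# Ten most common first letters in the combined English sample
head(sort(table(tolower(first_letters)), decreasing = TRUE), 10)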
To get a sense of the distribution of how words start (i.e. their first letters), here is a bar chart. Note: the first letter was converted to lower case only to keep the chart compact; that forced conversion will not be done anywhere else in this project.
require(ggplot2)
qplot(tolower(first_letters))
The following histogram then shows the distribution of lengths for the words in the data sets whose lengths are fewer than 11 characters.
lengths_of_practical_words <- word_lengths[word_lengths < 11]
hist(lengths_of_practical_words)
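The same cutoff can also be inspected as a plain frequency table rather than a histogram:
# Count of words at each length below the 11-character cutoff
table(lengths_of_practical_words)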
Let us try to make a word cloud with that sample of English words.
require(tm)
require(wordcloud)
English_corpus <- Corpus(VectorSource(English_words))
English_matrix <- as.matrix(TermDocumentMatrix(English_corpus))
word_counts <- sort(rowSums(English_matrix), decreasing = TRUE)
freq_df <- data.frame(word = names(word_counts), freq = word_counts)
pal2 <- brewer.pal(8, "Dark2")
wordcloud(freq_df$word, freq_df$freq, scale = c(8, .2), min.freq = 3,
          max.words = 50, random.order = FALSE, rot.per = .15, colors = pal2)
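Because the cloud is drawn from freq_df, the same ranking can be checked numerically:
# Ten most frequent words in the sample (stop words still included here)
head(freq_df, 10)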
How about a word cloud without “stop words”?
English_corpus_clean <- tm_map(English_corpus, removeWords, stopwords("english"))
English_matrix <- as.matrix(TermDocumentMatrix(English_corpus_clean))
word_counts <- sort(rowSums(English_matrix), decreasing = TRUE)
freq_df <- data.frame(word = names(word_counts), freq = word_counts)
pal2 <- brewer.pal(8, "Dark2")
wordcloud(freq_df$word, freq_df$freq, scale = c(8, .2), min.freq = 3,
          max.words = 50, random.order = FALSE, rot.per = .15, colors = pal2)
Eventually, we will want to build the prediction mechanism, so let us find the next few words in the collection for every place in our sample.
require(binhf)
next1 <- shift(English_words, 1, dir = "left")
next2 <- shift(English_words, 2, dir = "left")
next3 <- shift(English_words, 3, dir = "left")
next4 <- shift(English_words, 4, dir = "left")
next5 <- shift(English_words, 5, dir = "left")
word_canvas <- data.frame(English_words, first_letters, word_lengths, next1, next2, next3, next4, next5)
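For what it is worth, the first look-ahead column could also be built without the binhf dependency, using plain index shifting; here is a sketch for next1 only (this variant pads the final position with NA):
# Base-R alternative for the first look-ahead column: next1_base[i] is the
# word that follows position i, with NA at the very end of the sample
next1_base <- c(English_words[-1], NA)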
For example, we can look at our first line of words in the sample …
English_blogs_sample[1]
## [1] "Eggplants are easy to start from seed. See our sidebar for our favorite seed suppliers. We prefer to start our seeds in pots, but direct sowing is also fine. The seeds are fairly small, so don't plant too deeply. Keep moist until germination, and then be sure to protect the seedlings from any late frost. Temperatures must be warm for the seeds to germinate, so you may consider starting seeds indoors if the ambient temperature is still chilly in your area. Here in Houston we're already hitting the high 70's and low 80's during the day making it perfect eggplant-starting weather."
… and see how the next few words are organized in our data frame:
head(word_canvas)
## English_words first_letters word_lengths next1 next2 next3 next4
## 1 Eggplants E 9 are easy to start
## 2 are a 3 easy to start from
## 3 easy e 4 to start from seed.
## 4 to t 2 start from seed. See
## 5 start s 5 from seed. See our
## 6 from f 4 seed. See our sidebar
## next5
## 1 from
## 2 seed.
## 3 See
## 4 our
## 5 sidebar
## 6 for
Now we can finally dissect the possibilities for the next word based on the first letter of the word the user is typing and on that word's overall length. For example, suppose the first letter of the current word is “D” and the current word's length is 5. Conditioning on each of those features in turn, the possibilities for the next word include:
next1_for_d <- word_canvas[first_letters == "D" | first_letters == "d", ]$next1
this_corpus <- Corpus(VectorSource(as.character(next1_for_d)))
this_corpus <- tm_map(this_corpus, removeWords, stopwords("english"))
this_matrix <- as.matrix(TermDocumentMatrix(this_corpus))
word_counts <- sort(rowSums(this_matrix), decreasing = TRUE)
d_df <- data.frame(word = names(word_counts), freq = word_counts)
pal2 <- brewer.pal(8, "Dark2")
wordcloud(d_df$word, d_df$freq, scale = c(8, .2), min.freq = 2,
          max.words = 50, random.order = FALSE, rot.per = .15, colors = pal2)
next1_for_5 <- word_canvas[word_lengths == 5, ]$next1
this_corpus <- Corpus(VectorSource(as.character(next1_for_5)))
this_corpus <- tm_map(this_corpus, removeWords, stopwords("english"))
this_matrix <- as.matrix(TermDocumentMatrix(this_corpus))
word_counts <- sort(rowSums(this_matrix), decreasing = TRUE)
freq_df <- data.frame(word = names(word_counts), freq = word_counts)
pal2 <- brewer.pal(8, "Dark2")
wordcloud(freq_df$word, freq_df$freq, scale = c(8, .2), min.freq = 2,
          max.words = 50, random.order = FALSE, rot.per = .15, colors = pal2)
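Looking ahead to the prediction mechanism itself, the two conditions above would eventually be combined into a single lookup against word_canvas. The sketch below is only illustrative; the function name, the case-folding, and the ranked-table output are my own choices, not part of the final algorithm.
# Illustrative lookup: given the first letter and the length of the word
# currently being typed, rank the next words observed in the sample
suggest_next <- function(letter, len, top_n = 5) {
  rows <- word_canvas[tolower(as.character(word_canvas$first_letters)) == tolower(letter) &
                        word_canvas$word_lengths == len, ]
  sorted <- sort(table(as.character(rows$next1)), decreasing = TRUE)
  head(sorted, top_n)
}

suggest_next("d", 5)   # the running "D" / length-5 example above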