Synopsis

The goal of this exercise is to create a product that deals with natural language processing in some way. The application needs to include a user-implemented prediction algorithm and an interface that can be accessed by others. I decided to create a Shiny app that receives an input phrase in a text box and outputs a prediction of the next word. This report walks through the steps behind that prediction: importing the data, preprocessing it, calculating n-gram frequencies, and testing the algorithm on an example phrase.

Import Statements

First let’s read in the libraries and data.

## Import libraries
library(knitr)
library(quanteda)
library(stringr)

## Import data
## Small reader shared by the three corpora: returns the file's lines as a
## character vector, marks them as UTF-8 and skips embedded NUL bytes.
read_corpus <- function(path) {
  readLines(path, encoding = "UTF-8", skipNul = TRUE)
}

twit <- read_corpus("~/Desktop/final/en_US/twitter.txt")
blog <- read_corpus("~/Desktop/final/en_US/blogs.txt")
news <- read_corpus("~/Desktop/final/en_US/news.txt")

## Fix the random number generator state so the sampling step is repeatable
set.seed(456)

Data Preprocessing

After reading in the raw datasets, we’d like to find the most commonly used word sequences by calculating how often each one occurs. In order to speed up the processing time during our calculations, we’ll need to take a sample from the three datasets, but ensure that the sample is large enough to properly represent the data. In the fields of computational linguistics and probability, an “n-gram” is a contiguous sequence of “n” items from a given sequence of text or speech. Here, we will sample and clean the text so that bigrams, trigrams, four-grams, and five-grams can be extracted from it in the next section.

## Sample the data
## Draw up to 500000 lines without replacement from the combined corpora.
## The size is capped at the corpus length so a smaller corpus does not make
## sample() fail; with a larger corpus the draw is unchanged.
corpus <- c(twit, news, blog)
sample <- sample(corpus, min(500000, length(corpus)), replace = FALSE)

## Remove non-ASCII characters
sample <- iconv(sample, "UTF-8", "ASCII", sub = "")

## Clean the text
## URLs are removed first, while "://" and "." are still present, and only the
## URL token itself is dropped. A greedy pattern such as "http.*" would delete
## everything from the URL to the end of the line.
## NOTE: cleaning rules determine how many distinct n-grams exist, so code
## further down must derive n-gram counts from the data, not from constants.
sample <- gsub("(https?://|www\\.)\\S+", "", sample) # removing urls
sample <- gsub("[[:punct:]]", "", sample) # removing special characters
sample <- gsub("\\d", "", sample) # removing numbers
sample <- gsub("\\s+", " ", sample) # remove extra spaces

Calculating the Frequencies of N-Grams

The prediction algorithm is based on an N-gram model with “Stupid Backoff.” The algorithm checks to see if the highest-order (in this case, n=5) n-gram has been seen. If the highest-order is not applicable, the predicted word will “degrade” to a lower-order model (n=4, then 3, then 2). For additional prediction accuracy, common linguistic tweaks were implemented, as well.

## Build one lookup table per n-gram order (2 to 5).
##
## build_ngram_table(texts, n)
##   texts : character vector of cleaned documents
##   n     : n-gram order
##   value : data frame with character columns
##             phrase - the first n - 1 words of the n-gram
##             word   - the last word of the n-gram
##           holding one row per distinct phrase (its top-ranked completion),
##           with row names 1..nrow.
build_ngram_table <- function(texts, n) {
  ## Document-feature matrix of n-grams; the words of an n-gram are joined
  ## with a single space
  ngram_dfm <- dfm(texts, ngrams = n, verbose = FALSE, concatenator = " ")

  ## Rank n-grams, most common first. docfreq() counts the documents that
  ## contain the n-gram, not its total number of occurrences.
  freq <- sort(docfreq(ngram_dfm), decreasing = TRUE)
  ngram <- names(freq)

  ## Split every n-gram at its last space. The table size follows from the
  ## data, so no row count has to be hard-coded.
  lookup <- data.frame(
    phrase = sub(" [^ ]*$", "", ngram),
    word = sub("^.* ", "", ngram),
    stringsAsFactors = FALSE
  )

  ## Rows are ordered by rank, so keeping the first row per phrase keeps the
  ## top-ranked completion
  lookup <- lookup[!duplicated(lookup$phrase), ]

  ## Reset rownames
  row.names(lookup) <- NULL
  lookup
}

dfm2 <- build_ngram_table(sample, 2)
dfm3 <- build_ngram_table(sample, 3)
dfm4 <- build_ngram_table(sample, 4)
dfm5 <- build_ngram_table(sample, 5)

Example Phrase

We want to test our prediction algorithm to see if we receive an acceptable suggested word by providing our program an example phrase. In this case, we provided the phrase "My,  1 dog", since the phrase contains extra spaces, punctuation marks, numbers, and capital letters, which will be a good test for our algorithm. Feel free to adjust the phrase to some other text, but we will work with this phrase for now. After running our algorithm, the predicted word suggested to us is "is". According to the data and our document-feature matrix, this is the most frequent word that follows the phrase "my dog", so our algorithm gives a sensible result for this example.

## Example phrase
phrase <- "My,  1 dog"

## Clean input text with the same steps used for the training sample
phrase <- iconv(phrase, "UTF-8", "ASCII", sub = "") # removing non-ASCII
phrase <- gsub("\\d", "", phrase) # removing numbers
phrase <- gsub("[[:punct:]]", "", phrase) # removing special characters
phrase <- gsub("\\s+", " ", phrase) # removing extra spaces
## Without trimming, a leading space yields an empty first token, which
## inflates the word count and makes every lookup miss
phrase <- gsub("^ | $", "", phrase) # removing leading/trailing space
phrase <- tolower(phrase) # removing capital letters

## Get number of words in the phrase
phrase <- strsplit(phrase, " ", fixed = TRUE)[[1]]
nWords <- length(phrase)

## Get the last four or less words in the phrase
phrase <- tail(phrase, n = 4)

## Predict the next word given a phrase.
## Start with the longest available context (k words -> the (k + 1)-gram
## table). On a miss, drop the oldest word and retry with the next lower-order
## table. If nothing matches, or the phrase is empty, fall back to "the".
tables <- list(dfm2, dfm3, dfm4, dfm5) # element k serves a k-word context
context <- phrase
word <- NA_character_
while (is.na(word) && length(context) > 0) {
  lookup <- tables[[length(context)]]
  ## match() returns the first hit, i.e. the top-ranked completion
  hit <- match(paste(context, collapse = " "), lookup$phrase)
  if (!is.na(hit)) {
    word <- as.character(lookup$word[hit])
  }
  context <- context[-1]
}
if (is.na(word)) {
  word <- "the"
}

## Convert to a string
phrase <- paste(phrase, collapse = " ")

## Output the predicted word
word
## [1] "is"
## Check to see if this is correct: look the phrase up by value rather than by
## a fixed row number, which changes whenever the sample changes
dfm3[dfm3$phrase == "my dog", ]
## (prints the single row for "my dog"; its word column should equal the
## prediction above)