Set up the document & load needed libraries

knitr::opts_chunk$set(echo=T)
suppressPackageStartupMessages(library(tm))

Load and combine the English datasets

blogs <- readLines("../final/en_US/en_US.blogs.txt", skipNul=T, encoding="UTF-8")
news <- readLines("../final/en_US/en_US.news.txt", skipNul=T, encoding="UTF-8")
twits <- readLines("../final/en_US/en_US.twitter.txt", skipNul=T, encoding="UTF-8")
eng_data <- c(blogs, news, twits)
saveRDS(object=eng_data, file="eng_data.rds")
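
The combined data are saved so that later sessions can restore them with readRDS() instead of re-reading the three raw files; a minimal sketch:

eng_data <- readRDS(file="eng_data.rds")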

Implement a prediction function

# Define helper function & read helper file
delPat <- content_transformer(function(strng, pat) gsub(pattern=pat, replacement=" ", x=strng))
profane_words <- read.delim(file="../Milestone/badwords.txt", header=F)[, 1]

nextWord <- function(inp_txt) { 
  # Create a corpus from the entries containing the input text
  entries <- eng_data[grepl(pattern=inp_txt, x=eng_data, ignore.case=T)]
  if (length(entries) == 0) { return(data.frame(Word="-", Counts="None")) }
  regex_str <- paste(inp_txt, "([^ ]+)")
  targetWords <- ''
  for (i in 1:length(entries)) { 
    match_idx <- regexec(pattern=regex_str, text=entries[i], ignore.case=T)
    targetWords <- c(targetWords, regmatches(x=entries[i], m=match_idx)[[1]][2]) }
  corp <- VCorpus(VectorSource(data.frame(targetWords)))
  
  # Clean the corpus
  corp <- tm_map(x=corp, FUN=delPat, "(f|ht)tp(s?)://(.*)[.][a-z]+")
  corp <- tm_map(x=corp, FUN=delPat, "[^a-zA-Z ]") 
  corp <- tm_map(x=corp, FUN=removePunctuation)
  corp <- tm_map(x=corp, FUN=removeNumbers)
  corp <- tm_map(x=corp, FUN=content_transformer(FUN=tolower))
  corp <- tm_map(x=corp, FUN=removeWords, stopwords(kind="en"))
  corp <- tm_map(x=corp, FUN=removeWords, profane_words)
  corp <- tm_map(x=corp, FUN=stripWhitespace)
  corp <- tm_map(x=corp, FUN=PlainTextDocument)
  
  # Compute frequencies of each word/unigram
  docTermMat <- as.matrix(x=DocumentTermMatrix(corp))
  freq <- sort(colSums(x=docTermMat), decreasing=T)
  df <- data.frame(Word=names(freq), Counts=freq)
  rownames(df) <- 1:length(freq)
  numWord <- max(1, min(length(freq), 10))
  return(df[1:numWord, ])
}
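
As a side note on performance, the per-entry for loop above could be replaced by a single vectorised call, since regexec() and regmatches() accept a character vector of texts. The sketch below refers to the entries and regex_str objects defined inside nextWord() and was not used to produce the timings and quiz answers that follow:

matches <- regmatches(entries, regexec(regex_str, entries, ignore.case=TRUE))
targetWords <- vapply(matches, function(m) m[2], character(1))
targetWords <- targetWords[!is.na(targetWords)]  # drop entries where the phrase ends the line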

Estimate computational complexity

The function was timed for inputs of different lengths:

system.time(expr=nextWord("in a case of"))
##    user  system elapsed 
##   62.27    5.20   93.03
system.time(expr=nextWord("a case of"))
##    user  system elapsed 
##   49.16    0.23   51.56
system.time(expr=nextWord("case of"))
##    user  system elapsed 
##   53.20    0.10   58.47

In general, the run time of the function is roughly proportional to the length of the input text. The three-word phrase, however, ran faster than both the longer and the shorter input here, so the relationship is not strict.
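
For a slightly more systematic comparison, the three phrases can also be timed in a single pass; this sketch only wraps the calls already shown above:

phrases <- c("in a case of", "a case of", "case of")
sapply(phrases, function(p) system.time(nextWord(p))["elapsed"])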

Test the algorithm

The quiz questions are answered using the prediction function defined:

nextWord("a case of")
##        Word Counts
## 1      beer     19
## 2      wine      7
## 3     first      4
## 4    miller      4
## 5  mistaken      4
## 6    making      3
## 7       one      3
## 8   waiting      3
## 9     water      3
## 10  crossed      2
nextWord("would mean the")
##        Word Counts
## 1     world    203
## 2  absolute      2
## 3       end      2
## 4    entire      2
## 5   airport      1
## 6    angles      1
## 7   central      1
## 8   century      1
## 9     death      1
## 10   person      1
nextWord("make me the")
##        Word Counts
## 1  happiest     28
## 2      best      2
## 3    worlds      2
## 4       bad      1
## 5   biggest      1
## 6     blame      1
## 7       bun      1
## 8  daughter      1
## 9     first      1
## 10     girl      1
nextWord("struggling but")
##        Word Counts
## 1  avoiding      1
## 2  remember      1
## 3 westbrook      1
nextWord("date at the")
##        Word Counts
## 1       end      5
## 2      time      3
## 3       app      1
## 4       art      1
## 5    bottom      1
## 6    braves      1
## 7      cake      1
## 8    cheese      1
## 9  driskill      1
## 10     four      1
nextWord("be on my")
##     Word Counts
## 1    way     36
## 2   mind     14
## 3   show      8
## 4   list      7
## 5   side      7
## 6   best      5
## 7   game      5
## 8   ipod      5
## 9  couch      4
## 10 radio      4
nextWord("in quite some")
##   Word Counts
## 1 time     36
nextWord("with his little")
##           Word Counts
## 1      brother      4
## 2      bedroom      1
## 3        bitty      1
## 4     brothers      1
## 5        claws      1
## 6      company      1
## 7      cousins      1
## 8     daughter      1
## 9          dog      1
## 10 embroidered      1
nextWord("faith during the")
##      Word Counts
## 1 worship      1
nextWord("you must be")
##         Word Counts
## 1       able     29
## 2    willing     12
## 3      bored      9
## 4     really      9
## 5  wondering      9
## 6       born      8
## 7        one      8
## 8      tired      8
## 9   watching      7
## 10       new      6

Conclusions

  1. The function does not work in all cases, especially when specific phrases occur only a limited number of times in the training dataset.
  2. A bit more effort is required to turn the function into a predictive model: the model needs to output one specific word, which logically should be the most frequently occurring one (see the sketch after this list).
  3. The biggest advantage of the model is that input text of any length can be provided, although the probability of finding no occurrence in the dataset increases with the length of the input. On the other hand, the long processing time is the main disadvantage of the model. This could potentially be reduced by avoiding for loops (as sketched after the prediction function above) and by piping consecutive expressions.
  4. However, the processing time of the current approach would not be feasible if a Shiny app were to be produced. The length of the input text would have to be limited to n words, and (n+1)-grams could be created and stored up front to reduce the processing time (see the sketch after this list).
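
As a rough illustration of points 2 and 4, the sketch below reduces the frequency table to a single predicted word and builds a small pre-computed 4-gram table from a random sample of the data. The helper names nextWordTop() and predictFromTable(), the sample size, and the cleaning steps are introduced here for illustration only; they are not part of the function defined above.

# Idea 2: keep only the most frequent following word
nextWordTop <- function(inp_txt) {
  res <- nextWord(inp_txt)
  as.character(res$Word[1])
}

# Idea 4: pre-compute 4-gram counts once (here on a small random sample),
# so that predicting from a 3-word prefix becomes a table lookup
set.seed(1)
sample_lines <- sample(eng_data, 10000)
clean <- gsub("[^a-z ]", " ", tolower(sample_lines))
tokens <- strsplit(clean, "\\s+")
fourgrams <- unlist(lapply(tokens, function(w) {
  w <- w[w != ""]
  if (length(w) < 4) return(character(0))
  idx <- 1:(length(w) - 3)
  paste(w[idx], w[idx + 1], w[idx + 2], w[idx + 3])
}))
fourgram_counts <- sort(table(fourgrams), decreasing=TRUE)

predictFromTable <- function(prefix) {
  # keep only 4-grams whose first three words match the prefix
  hits <- fourgram_counts[startsWith(names(fourgram_counts), paste0(tolower(prefix), " "))]
  if (length(hits) == 0) return(NA_character_)
  sub(".* ", "", names(hits)[1])  # last word of the most frequent matching 4-gram
}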