Setup Document

The knitr options and libraries needed are first loaded:

knitr::opts_chunk$set(echo=T)
set.seed(77)
suppressPackageStartupMessages(library(tm))

Prepare Datasets

The whole English dataset is rather large, consisting of more than 3 million entries in total. To reduce the time taken to implement the model, a subset of 1% of this size is first extracted, and then split into training (80%) and validation (20%) sets. The model will be evaluated on the quiz questions so the size of the testing set is only 10:

# Load whole English dataset
engData <- readRDS(file="../Quiz2/eng_data.rds")
subsetProp <- 0.01
train_val_data <- sample(x=engData, size=subsetProp * length(engData))

# Train-validation split
trainProp <- 0.8
sampleNum <- length(x=train_val_data)
trainIdx <- sample(x=1:sampleNum, size=sampleNum * trainProp)
valIdx <- setdiff(x=1:sampleNum, y=trainIdx)
train <- train_val_data[trainIdx]
validation <- train_val_data[valIdx]

Create Clean N-grams Data

# Define n-gram tokenization functions
bigram <- function(x) unlist(lapply(ngrams(words(x), 2), paste, collapse=" "), use.names=F)
trigram <- function(x) unlist(lapply(ngrams(words(x), 3), paste, collapse=" "), use.names=F)
quadrigram <- function(x) unlist(lapply(ngrams(words(x), 4), paste, collapse=" "), use.names=F)
pentagram <- function(x) unlist(lapply(ngrams(words(x), 5), paste, collapse=" "), use.names=F)
hexagram <- function(x) unlist(lapply(ngrams(words(x), 6), paste, collapse=" "), use.names=F)

# Clean the corpus
delPat <- function(strng, pat) gsub(pattern=pat, replacement=" ", x=strng)
profaneWords <- read.delim(file="../Milestone/badwords.txt", header=F)[, 1]
trainCorp <- VCorpus(VectorSource(train))
trainCorp <- tm_map(x=trainCorp, FUN=content_transformer(FUN=delPat), "(f|ht)tp(s?)://(.*)[.][a-z]+")
trainCorp <- tm_map(x=trainCorp, FUN=content_transformer(FUN=delPat), "[^a-zA-Z ]") 
trainCorp <- tm_map(x=trainCorp, FUN=removePunctuation)
trainCorp <- tm_map(x=trainCorp, FUN=removeNumbers)
trainCorp <- tm_map(x=trainCorp, FUN=content_transformer(FUN=tolower))
trainCorp <- tm_map(x=trainCorp, FUN=removeWords, stopwords(kind="en"))
trainCorp <- tm_map(x=trainCorp, FUN=removeWords, profaneWords)
stemTrainCorp <- tm_map(x=trainCorp, FUN=stemDocument)
stemTrainCorp <- tm_map(x=stemTrainCorp, FUN=stripWhitespace)
stemTrainCorp <- tm_map(x=stemTrainCorp, FUN=PlainTextDocument)
unstemTrainCorp <- tm_map(x=trainCorp, FUN=stripWhitespace)
unstemTrainCorp <- tm_map(x=unstemTrainCorp, FUN=PlainTextDocument)

# Compute document term matrices for each n-gram (up to n=4)
docTermMat2 <- DocumentTermMatrix(stemTrainCorp, control=list(tokenize=bigram))
docTermMat3 <- DocumentTermMatrix(stemTrainCorp, control=list(tokenize=trigram))
docTermMat4 <- DocumentTermMatrix(stemTrainCorp, control=list(tokenize=quadrigram))
docTermMat5 <- DocumentTermMatrix(stemTrainCorp, control=list(tokenize=pentagram))
docTermMat6 <- DocumentTermMatrix(stemTrainCorp, control=list(tokenize=hexagram))

# Save the matrices
saveRDS(docTermMat2, file="./dtm2.rds")
saveRDS(docTermMat3, file="./dtm3.rds")
saveRDS(docTermMat4, file="./dtm4.rds")
saveRDS(docTermMat5, file="./dtm5.rds")
saveRDS(docTermMat6, file="./dtm6.rds")
nGramsList <- c("dtm2.rds", "dtm3.rds", "dtm4.rds", "dtm5.rds", "dtm6.rds")

Implement the New Model

# Apply cleaning transformations
txtTransfrom <- function(txt) {
  txt <- delPat(strng=txt, pat="(f|ht)tp(s?)://(.*)[.][a-z]+")
  txt <- delPat(strng=txt, pat="[^a-zA-Z ]") 
  txt <- removePunctuation(x=txt)
  txt <- removeNumbers(x=txt)
  txt <- tolower(x=txt)
  txt <- removeWords(x=txt, words=stopwords(kind="en"))
  txt <- removeWords(x=txt, words=profaneWords)
  txt <- stemDocument(x=txt)
  txt <- stripWhitespace(x=txt)
  return(txt)
}

# Implement backoff-model
nextWord <- function(inp_txt) { 
  
  # Extract usable parts from the input text
  inp_txt <- txtTransfrom(txt=inp_txt)
  phrase <- strsplit(x=inp_txt, split=" ")[[1]]
  if (length(phrase) == 0) { return("No text is detected after cleaning!") }
  else if ((length(unique(phrase)) == 1) & (unique(phrase) == "")) { return("Only spaces left after cleaning!") }
  else if (length(phrase) > 5) {phrase <- phrase[1:5]}
  
  # Search for entries containing the phrase of length n in stored (n+1)-grams
  searchOrder <- length(phrase):1
  for (i in seq_along(searchOrder)) {
    phrase_str <- paste(phrase[i:length(phrase)], collapse=" ")
    docTermMat <- readRDS(file=nGramsList[searchOrder[i]])
    grams <- docTermMat$dimnames[["Terms"]]
    entries <- grams[grepl(pattern=phrase_str, x=grams, ignore.case=T)]
    if (length(entries) == 0) next else break }
  if (length(entries) == 0) { return("No similar phrase found in training data, sorry!") }

  # Create a corpus from the entries containing the phrase
  regex_str <- paste(phrase_str, "([^ ]+)")
  targetWords <- ''
  for (i in 1:length(entries)) { 
    match_idx <- regexec(pattern=regex_str, text=entries[i], ignore.case=T)
    targetWords <- c(targetWords, regmatches(x=entries[i], m=match_idx)[[1]][2]) }
  targetWords <- targetWords[(!is.na(targetWords)) & (targetWords != "")]
  if (length(targetWords) == 0) { return("No similar phrase found in training data, sorry!") }
  corp <- VCorpus(VectorSource(data.frame(targetWords)))
  
  # Compare probability for each unigram and return the most likely word
  targetDTM <- as.matrix(x=DocumentTermMatrix(corp))
  freq <- sort(colSums(x=targetDTM), decreasing=T)
  df <- data.frame(Word=names(freq), Counts=freq, Probability=freq/length(freq))
  rownames(df) <- 1:length(freq)
  return(stemCompletion(x=df[1, 1], dictionary=unstemTrainCorp, type="first")[[1]])
}

Estimate Computational Complexity

Using the same phrases for the previous model, this model was also timed for comparison:

system.time(expr=nextWord("in a case of"))

##    user  system elapsed 
##    0.20    0.00    0.24

system.time(expr=nextWord("a case of"))

##    user  system elapsed 
##    0.16    0.02    0.17

system.time(expr=nextWord("case of"))

##    user  system elapsed 
##    0.14    0.00    0.14

The trend of increment in complexity as the length of input text increases is clearly shown. The time taken by the model to make a prediction for the same phrases used in previous model is shorter by about 3 orders of magnitude. This indicates the success of computing and storing the n-grams outside of the model in reducing the computational complexity of the problem.

Test the Model

The quiz questions are answered using the prediction function defined:

# Answer quiz questions
predictions <- ''
predictions[1] <- nextWord("I'd live and I'd")
predictions[2] <- nextWord("telling me about his")
predictions[3] <- nextWord("see arctic monkeys this")
predictions[4] <- nextWord("helps reduce your")
predictions[5] <- nextWord("to take a")
predictions[6] <- nextWord("to settle the")
predictions[7] <- nextWord("groceries in each")
predictions[8] <- nextWord("bottom to the")
predictions[9] <- nextWord("bruises from playing")
predictions[10] <- nextWord("of Adam Sandler's")
predictions

##  [1] "No similar phrase found in training data, sorry!"
##  [2] "blunt"                                           
##  [3] "like"                                            
##  [4] "costs"                                           
##  [5] "action"                                          
##  [6] "internetz"                                       
##  [7] "half"                                            
##  [8] "heavily"                                         
##  [9] "games"                                           
## [10] "one"

# Compute accuracy
answers <- c("die", "marital", "week end", "stress", "picture", "matter", "hand", "top", "outside", "movies")
correctNum <- 0
for (i in 1:length(predictions)) { if (predictions[i] == answers[i]) { correctNum <- correctNum + 1} }
correctNum / length(predictions)

## [1] 0

Conclusion

Using a smaller list of profane words would reduce some processing time of the model.
The uncertainty of the word prediction is reflected by the probability of the word returned.
The n-gram backoff-model is inefficient as each n-gram has to be stored exterior of the model. This is rather inflexible and will tend to miss out long-term dependency between words in a long phrase. The model could not make use of the context in the earlier part of an overly-long (in this case, more than 6 words) sentence.
Some of the most commonly missed n-grams are phrases that contain apostrophes. For instance, “I’d” in the first quiz question. This could potentially be fixed by using a larger training set.
Validation of the model has not been carried out.

Data Science Capstone Quiz 3

Jon Ting