This is final project after nine-course data science series supported by SwiftKey. It is about building a model based on given corpus to predict word to be typed next.

Data cleaning and tokenization

Raw text data contain blogs, news and twitters. Each of them is about 200 MB. Due to the large volume of data (for R), I only sampled 20% to speed up the loading and predicting process.

library(tm)
library(RWeka)
getSources()
getReaders()
cname <- file.path(".", "sampled10", "sample05")
length(dir(cname))
dir(cname)
docs <- Corpus(DirSource(cname))
class(docs)

Numbers, symbols and punctuations are removed from corpus except apostrophes in contractions are handled with special care. All alphabetic characters are turned into lower case. Note that Stopwords are not removed to keep sentences and phrases complete.

docs <- tm_map(docs, tolower, mc.cores = 1)
docs <- tm_map(docs, removeNumbers, mc.cores = 1)
docs <- tm_map(docs, stripWhitespace, mc.cores = 1)
#docs <- tm_map(docs, removeWords, stopwords("english"), mc.cores = 1)
docs <- tm_map(docs, function(x) gsub("‘|`|’", "'", x), mc.cores = 1)
docs <- tm_map(docs, function(x) gsub("\u0094", "", x), mc.cores = 1)
docs <- tm_map(docs, function(x) gsub("([a-z])'([a-z])", "\\1ZZZ\\2", x), mc.cores = 1) # protect contractions
docs <- tm_map(docs, removePunctuation, mc.cores = 1)
docs <- tm_map(docs, function(x) gsub("ZZZ", "'", x), mc.cores = 1) # restore apostrophe

The sampled corpus are tokenized into unigrams, bigrams and trigrams.

options(mc.cores=1)
# generate unigram
UnigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1, delimiters = ' '))
tdmUnitoken <- TermDocumentMatrix(docs, control = list(tokenize = UnigramTokenizer, tolower = FALSE, removePunctuation = FALSE))
freqUnitoken <- rowSums(as.matrix(tdmUnitoken))
freqUnitoken <- sort(freqUnitoken[freqUnitoken>1], decreasing = TRUE)
unigram <- data.frame(token = names(freqUnitoken), count = freqUnitoken)
write.csv(file = "tokens/unigram5.csv", x = unigram, row.names = FALSE)
# generate bigram
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2, delimiters = ' '))
tdmBitoken <- TermDocumentMatrix(docs, control = list(tokenize = BigramTokenizer, tolower = FALSE, removePunctuation = FALSE))
freqBitoken <- rowSums(as.matrix(tdmBitoken))
freqBitoken <- sort(freqBitoken[freqBitoken > 1], decreasing = TRUE)
bigram <- data.frame(token = names(freqBitoken), count = freqBitoken)
write.csv(file = "tokens/bigram5.csv", x = bigram, row.names = FALSE)
# generate trigram
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3, delimiters = ' '))
tdmTritoken <- TermDocumentMatrix(docs, control = list(tokenize = TrigramTokenizer, tolower = FALSE))
freqTritoken <- rowSums(as.matrix(tdmTritoken))
freqTritoken <- sort(freqTritoken[freqTritoken > 1], decreasing = TRUE)
trigram <- data.frame(token = names(freqTritoken), count = freqTritoken)
write.csv(file = "tokens/trigram5.csv", x = trigram, row.names = FALSE)

Modeling

Loading grams

It takes about one minute to load tokens.

unigram.data.frame <- read.csv("unigram.csv")
freqUnitoken <- unigram.data.frame$count
freqUnitoken <- freqUnitoken / max(freqUnitoken)
names(freqUnitoken) <- unigram.data.frame$token
bigram.data.frame <- read.csv("bigram.csv")
freqBitoken <- bigram.data.frame$count
freqBitoken <- freqBitoken / max(freqBitoken)
names(freqBitoken) <- bigram.data.frame$token
trigram.data.frame <- read.csv("trigram.csv")
freqTritoken <- trigram.data.frame$count
freqTritoken <- freqTritoken / max(freqTritoken)
names(freqTritoken) <- trigram.data.frame$token

Checking spelling

As a proof-of-concept, only edit-distance-of-one typos are included which can cover about 80% of all typos. Typos can be corrected in four ways: replacing, inserting, deleting and transposing.

spellCorrection <- function(inputwd) {
  charVec <- unlist(strsplit(inputwd, ""))
  delete <- vector(mode="character", length=0)
  transpose <- vector(mode="character", length=0)
  replace <- vector(mode="character", length=0)
  insert <- vector(mode="character", length=0)
  ### replace 
  for (i in 1:nchar(inputwd)) {
    lettersMissOne <- letters[which(letters != charVec[i])]
    replace <- c(replace,unlist(lapply(lettersMissOne, function(x) {charVec[i] <- x; return (paste(charVec, collapse=""))})))
  }
  
  ### insert
  for (i in 0:(nchar(inputwd)-1)) {
    insert <- c(insert, unlist(lapply(letters, function(x) paste(append(charVec, x, after=i), collapse=""))))
  }
  
  if (nchar(inputwd) > 1) {
    ### delete
    for (i in 1:nchar(inputwd)) {
      delete <- c(delete, paste(charVec[-i], collapse=""))
    }
    
    ### transpose
    for (i in 1:(nchar(inputwd)-1)) {
      temp <- charVec
      swap <- temp[i]
      temp[i] <- temp[i+1]
      temp[i+1] <- swap
      transpose <- c(transpose, paste(temp, collapse=""))
    }
  }
  return (c(replace, insert, delete, transpose))
}

selectFirstTen <- function(x) {
  ifelse(length(x)>10, x<-x[1:10], x<-x)
  return (x)
}

predicting typing

Back-off algorithm is implemented: Trigram list will be searched first, if the number of candidates is less than ten, bigram list will be searched, and then unigram until we have ten words to plot.

      ptm <- proc.time()
      inputStr <- "great ca "
      specialChar <- "|\\r|\\n|\\t|\\.|,|;|:|\"|(|)|\\?|!|-|/|&|—|–|“|”|…|‘|@|<|£|€|½|~|_|\\[|\\]|\\{|\\}|•|`|\\^|=|>|♥|❤|☀|♪|♫|⁰|☛"
      cleanedStr <- tolower(gsub(specialChar, "", inputStr))
      punctuation <- c(".", ";", "!", "?")
      if (inputStr == "") {
        output <- freqUnitoken[1:10]
        names(output) <- unlist(lapply(names(freqUnitoken[1:10]), function(x) paste(toupper(substr(x, 1, 1)),substr(x, 2, nchar(x)), sep = "")))
      } else {
        lastChar <- substr(inputStr, nchar(inputStr), nchar(inputStr))
        inputVec <- unlist(strsplit(inputStr, ' '))
        cleanedVec <- unlist(strsplit(cleanedStr, ' '))
        if (length(inputVec) == 1) {
          if (lastChar == " ") {
            bigram <- cleanedVec[length(cleanedVec)]
            bigramRegex <- paste("^", bigram, " ", sep="")
            matchedIdxBi <- grep(bigramRegex, names(freqBitoken), perl= TRUE)
            bigramCandidate <- sort(freqBitoken[matchedIdxBi], decreasing = TRUE)
            names(bigramCandidate) <- unlist(lapply(names(bigramCandidate), function(x) unlist(strsplit(x, ' '))[2]))
            unigramCandidate <- freqUnitoken[1:10]
            if (length(bigramCandidate) != 0)
              unigramCandidate <- unigramCandidate*bigramCandidate[length(bigramCandidate)]            
            mergedCandidate <- c(bigramCandidate, unigramCandidate)
            candidate <- mergedCandidate[unique(c(names(bigramCandidate), names(unigramCandidate)))][1:10]
            if (substr(inputStr, nchar(inputStr)-1, nchar(inputStr)-1) %in% punctuation)    
              names(candidate) <- unlist(lapply(names(candidate), function(x) paste(toupper(substr(x, 1, 1)),substr(x, 2, nchar(x)), sep = "")))
            output <- candidate
          } else {
            typingWord <- cleanedVec[length(cleanedVec)]
            errorWords <- spellCorrection(typingWord)
            knownWords <- sort(freqUnitoken[intersect(errorWords, names(freqUnitoken))], decreasing=TRUE)
            typingWordRegex <- paste("^", tolower(typingWord), sep="")
            matchedIdx <- grep(typingWordRegex, names(freqUnitoken), perl= TRUE)          
            unigramCandidate <- sort(freqUnitoken[unique(c(names(freqUnitoken[matchedIdx]), names(knownWords)))], decreasing=TRUE)
            if (length(unigramCandidate) >= 10) {
              candidate <- unigramCandidate[1:10]
            } else {
              zerogramCandidate  <- freqUnitoken[1:(10+length(unigramCandidate))]
              if (length(unigramCandidate) != 0)
                zerogramCandidate <- zerogramCandidate/max(zerogramCandidate)*unigramCandidate[length(unigramCandidate)]
              mergedCandidate <- c(unigramCandidate, zerogramCandidate)
              candidate <- mergedCandidate[unique(c(names(unigramCandidate), names(zerogramCandidate)))][1:10]
            }
            if (substr(inputStr, 1, 1) %in% LETTERS)
              names(candidate) <- unlist(lapply(names(candidate), function(x) paste(toupper(substr(x, 1, 1)),substr(x, 2, nchar(x)), sep = "")))
            output <- candidate
          }
        } else if (length(inputVec) == 2) {
          if (lastChar == " ") {
            trigramRegex <- paste("^", cleanedStr, sep="")
            matchedIdxTri <- grep(trigramRegex, names(freqTritoken), perl= TRUE)
            trigramCandidate <- sort(freqTritoken[matchedIdxTri], decreasing = TRUE)
            names(trigramCandidate) <- unlist(lapply(names(trigramCandidate), function(x) unlist(strsplit(x, ' '))[3]))
            
            if (length(trigramCandidate) >= 10) {
              candidate <- trigramCandidate[1:10]
            } else {  
              bigram <- cleanedVec[length(cleanedVec)]
              bigramRegex <- paste("^", bigram, " ", sep="")
              matchedIdxBi <- grep(bigramRegex, names(freqBitoken), perl= TRUE)
              bigramCandidate <- sort(freqBitoken[matchedIdxBi], decreasing = TRUE)
              names(bigramCandidate) <- unlist(lapply(names(bigramCandidate), function(x) unlist(strsplit(x, ' '))[2]))
              bigramCandidate <- bigramCandidate/max(bigramCandidate)*trigramCandidate[length(trigramCandidate)]
              mergedCandidate <- c(trigramCandidate, bigramCandidate)
              mergedCandidate <- mergedCandidate[unique(c(names(trigramCandidate), names(bigramCandidate)))]
              if (length(mergedCandidate) >= 10) {
                candidate <- mergedCandidate[1:10]
              } else {
                unigramCandidate <- freqUnitoken[1:(10+length(mergedCandidate))]*mergedCandidate[length(mergedCandidate)]            
                mergedCandidate <- c(trigramCandidate, bigramCandidate, unigramCandidate)
                candidate <- mergedCandidate[unique(c(names(trigramCandidate), names(bigramCandidate), names(unigramCandidate)))][1:10]
              }
            }     
            if (substr(inputStr, nchar(inputStr)-1, nchar(inputStr)-1) %in% punctuation)    
              names(candidate) <- unlist(lapply(names(candidate), function(x) paste(toupper(substr(x, 1, 1)),substr(x, 2, nchar(x)), sep = "")))
            output <- candidate
          } else {  ### length of inputVec is 2 and lastChar is NOT " ".
            typingWord <- cleanedVec[length(cleanedVec)]
            bigramRegex <- paste("^", cleanedStr, sep="")
            matchedIdxBi <- grep(bigramRegex, names(freqBitoken), perl= TRUE)
            bigramCandidate <- sort(freqBitoken[matchedIdxBi], decreasing = TRUE)
            bigramCandidate <- selectFirstTen(bigramCandidate)
            names(bigramCandidate) <- unlist(lapply(names(bigramCandidate), function(x) unlist(strsplit(x, ' '))[2]))
            
            if (length(bigramCandidate) >= 10) {
              candidate <- bigramCandidate[1:10]
            } else {
              errorWords <- spellCorrection(typingWord)
              knownWords <- sort(freqUnitoken[intersect(errorWords, names(freqUnitoken))], decreasing=TRUE)
              typingWordRegex <- paste("^", typingWord, sep="")
              matchedIdx <- grep(typingWordRegex, names(freqUnitoken), perl= TRUE)          
              unigramCandidate <- sort(freqUnitoken[unique(c(names(freqUnitoken[matchedIdx]), names(knownWords)))], decreasing=TRUE)
              if (length(bigramCandidate) != 0)
                unigramCandidate <- unigramCandidate/max(unigramCandidate)*bigramCandidate[length(bigramCandidate)]
              mergedCandidate <- c(bigramCandidate, unigramCandidate)
              candidate <- mergedCandidate[unique(c(names(bigramCandidate), names(unigramCandidate)))][1:10]
            }
            if (substr(inputVec[length(inputVec)], 1, 1) %in% LETTERS)
              names(candidate) <- unlist(lapply(names(candidate), function(x) paste(toupper(substr(x, 1, 1)),substr(x, 2, nchar(x)), sep = "")))
            output <- candidate            
          }    ### length of inputVec is 2 and lastChar is NOT " ".
        } else {  ### length of inputVec is larger than 2.
          if (lastChar == " ") {    ### length of inputVec is larger than 2 and lastChar is " ".
            trigram <- paste(cleanedVec[length(cleanedVec)-1], cleanedVec[length(cleanedVec)])
            trigramRegex <- paste("^", trigram, " ", sep="")
            matchedIdxTri <- grep(trigramRegex, names(freqTritoken), perl= TRUE)
            trigramCandidate <- sort(freqTritoken[matchedIdxTri], decreasing = TRUE)
            names(trigramCandidate) <- unlist(lapply(names(trigramCandidate), function(x) unlist(strsplit(x, ' '))[3]))
            
            if (length(trigramCandidate) >= 10) {
              candidate <- trigramCandidate[1:10]
            } else {
              bigram <- cleanedVec[length(cleanedVec)]
              bigramRegex <- paste("^", bigram, " ", sep="")
              matchedIdxBi <- grep(bigramRegex, names(freqBitoken), perl= TRUE)
              bigramCandidate <- sort(freqBitoken[matchedIdxBi], decreasing = TRUE)
              names(bigramCandidate) <- unlist(lapply(names(bigramCandidate), function(x) unlist(strsplit(x, ' '))[2]))
              if (length(trigramCandidate) != 0)
                bigramCandidate <- bigramCandidate/max(bigramCandidate)*trigramCandidate[length(trigramCandidate)]
              mergedCandidate <- c(trigramCandidate, bigramCandidate)
              mergedCandidate <- mergedCandidate[unique(c(names(trigramCandidate), names(bigramCandidate)))]
              if (length(mergedCandidate) >= 10) {
                candidate <- mergedCandidate[1:10]
              } else {
                unigramCandidate <- freqUnitoken[1:10]
                if (length(mergedCandidate) != 0)
                  unigramCandidate <- unigramCandidate/max(unigramCandidate)*mergedCandidate[length(mergedCandidate)]
                mergedCandidate2 <- c(mergedCandidate, unigramCandidate)
                mergedCandidate2 <- mergedCandidate2[unique(c(names(mergedCandidate), names(unigramCandidate)))] 
                candidate <- mergedCandidate2[1:10]
              }
            }
            if (substr(inputStr, nchar(inputStr)-1, nchar(inputStr)-1) %in% punctuation)    
              names(candidate) <- unlist(lapply(names(candidate), function(x) paste(toupper(substr(x, 1, 1)),substr(x, 2, nchar(x)), sep = "")))
            output <- candidate
          } else {    ### length of inputVec is larger than 2 and lastChar is NOT " ".
            typingWord <- cleanedVec[length(cleanedVec)]
            trigram <- paste(cleanedVec[length(cleanedVec)-2], cleanedVec[length(cleanedVec)-1], cleanedVec[length(cleanedVec)])
            trigramRegex <- paste("^", trigram, sep="")
            matchedIdxTri <- grep(trigramRegex, names(freqTritoken), perl= TRUE)
            trigramCandidate <- sort(freqTritoken[matchedIdxTri], decreasing = TRUE)
            names(trigramCandidate) <- unlist(lapply(names(trigramCandidate), function(x) unlist(strsplit(x, ' '))[3]))
            
            if (length(trigramCandidate) >= 10) {
              candidate <- trigramCandidate[1:10]
            } else {
              bigram <- paste(cleanedVec[length(cleanedVec)-1], cleanedVec[length(cleanedVec)])
              bigramRegex <- paste("^", bigram, sep="")
              matchedIdxBi <- grep(bigramRegex, names(freqBitoken), perl= TRUE)
              bigramCandidate <- sort(freqBitoken[matchedIdxBi], decreasing = TRUE)
              names(bigramCandidate) <- unlist(lapply(names(bigramCandidate), function(x) unlist(strsplit(x, ' '))[2]))
              if (length(trigramCandidate) != 0)
                bigramCandidate <- bigramCandidate/max(bigramCandidate)*trigramCandidate[length(trigramCandidate)]
              mergedCandidate <- c(trigramCandidate, bigramCandidate)
              mergedCandidate <- mergedCandidate[unique(c(names(trigramCandidate), names(bigramCandidate)))]
              if (length(mergedCandidate) >= 10) {
                candidate <- mergedCandidate[1:10]
              } else {
                errorWords <- spellCorrection(typingWord)
                knownWords <- sort(freqUnitoken[intersect(errorWords, names(freqUnitoken))], decreasing=TRUE)
                typingWordRegex <- paste("^", typingWord, sep="")
                matchedIdx <- grep(typingWordRegex, names(freqUnitoken), perl= TRUE)          
                unigramCandidate <- sort(freqUnitoken[unique(c(names(freqUnitoken[matchedIdx]), names(knownWords)))], decreasing=TRUE)              
                if (length(mergedCandidate) != 0)
                  unigramCandidate <- unigramCandidate/max(unigramCandidate)*mergedCandidate[length(mergedCandidate)]
                mergedCandidate2 <- c(mergedCandidate, unigramCandidate)
                mergedCandidate2 <- mergedCandidate2[unique(c(names(mergedCandidate), names(unigramCandidate)))] 
                if (length(mergedCandidate2) >= 10) {
                  candidate <- mergedCandidate2[1:10]
                } else {
                  zerogramCandidate <- freqUnitoken[1:10]
                  if (length(mergedCandidate2) != 0)
                    zerogramCandidate <- zerogramCandidate/max(zerogramCandidate)*mergedCandidate2[length(mergedCandidate2)]
                  mergedCandidate3 <- c(mergedCandidate2, zerogramCandidate)
                  mergedCandidate3 <- mergedCandidate3[unique(c(names(mergedCandidate2), names(zerogramCandidate)))]
                  candidate <- mergedCandidate3[1:10]
                }
              }
            }
            if (substr(inputVec[length(inputVec)], 1, 1) %in% LETTERS)
              names(candidate) <- unlist(lapply(names(candidate), function(x) paste(toupper(substr(x, 1, 1)),substr(x, 2, nchar(x)), sep = "")))
            output <- candidate
          }
        }
      }     
      library(wordcloud)
      library(RColorBrewer)
      pal2 <- brewer.pal(8,"Dark2")
      wordcloud(names(output),sqrt(output), scale=c(10,0.5),min.freq=0, max.words=Inf, random.order=FALSE, rot.per=.15, colors=pal2)