This is final project after nine-course data science series supported by SwiftKey. It is about building a model based on given corpus to predict word to be typed next.
Raw text data contain blogs, news and twitters. Each of them is about 200 MB. Due to the large volume of data (for R), I only sampled 20% to speed up the loading and predicting process.
library(tm)
library(RWeka)
getSources()
getReaders()
cname <- file.path(".", "sampled10", "sample05")
length(dir(cname))
dir(cname)
docs <- Corpus(DirSource(cname))
class(docs)
Numbers, symbols and punctuations are removed from corpus except apostrophes in contractions are handled with special care. All alphabetic characters are turned into lower case. Note that Stopwords are not removed to keep sentences and phrases complete.
docs <- tm_map(docs, tolower, mc.cores = 1)
docs <- tm_map(docs, removeNumbers, mc.cores = 1)
docs <- tm_map(docs, stripWhitespace, mc.cores = 1)
#docs <- tm_map(docs, removeWords, stopwords("english"), mc.cores = 1)
docs <- tm_map(docs, function(x) gsub("‘|`|’", "'", x), mc.cores = 1)
docs <- tm_map(docs, function(x) gsub("\u0094", "", x), mc.cores = 1)
docs <- tm_map(docs, function(x) gsub("([a-z])'([a-z])", "\\1ZZZ\\2", x), mc.cores = 1) # protect contractions
docs <- tm_map(docs, removePunctuation, mc.cores = 1)
docs <- tm_map(docs, function(x) gsub("ZZZ", "'", x), mc.cores = 1) # restore apostrophe
The sampled corpus are tokenized into unigrams, bigrams and trigrams.
options(mc.cores=1)
# generate unigram
UnigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1, delimiters = ' '))
tdmUnitoken <- TermDocumentMatrix(docs, control = list(tokenize = UnigramTokenizer, tolower = FALSE, removePunctuation = FALSE))
freqUnitoken <- rowSums(as.matrix(tdmUnitoken))
freqUnitoken <- sort(freqUnitoken[freqUnitoken>1], decreasing = TRUE)
unigram <- data.frame(token = names(freqUnitoken), count = freqUnitoken)
write.csv(file = "tokens/unigram5.csv", x = unigram, row.names = FALSE)
# generate bigram
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2, delimiters = ' '))
tdmBitoken <- TermDocumentMatrix(docs, control = list(tokenize = BigramTokenizer, tolower = FALSE, removePunctuation = FALSE))
freqBitoken <- rowSums(as.matrix(tdmBitoken))
freqBitoken <- sort(freqBitoken[freqBitoken > 1], decreasing = TRUE)
bigram <- data.frame(token = names(freqBitoken), count = freqBitoken)
write.csv(file = "tokens/bigram5.csv", x = bigram, row.names = FALSE)
# generate trigram
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3, delimiters = ' '))
tdmTritoken <- TermDocumentMatrix(docs, control = list(tokenize = TrigramTokenizer, tolower = FALSE))
freqTritoken <- rowSums(as.matrix(tdmTritoken))
freqTritoken <- sort(freqTritoken[freqTritoken > 1], decreasing = TRUE)
trigram <- data.frame(token = names(freqTritoken), count = freqTritoken)
write.csv(file = "tokens/trigram5.csv", x = trigram, row.names = FALSE)
It takes about one minute to load tokens.
unigram.data.frame <- read.csv("unigram.csv")
freqUnitoken <- unigram.data.frame$count
freqUnitoken <- freqUnitoken / max(freqUnitoken)
names(freqUnitoken) <- unigram.data.frame$token
bigram.data.frame <- read.csv("bigram.csv")
freqBitoken <- bigram.data.frame$count
freqBitoken <- freqBitoken / max(freqBitoken)
names(freqBitoken) <- bigram.data.frame$token
trigram.data.frame <- read.csv("trigram.csv")
freqTritoken <- trigram.data.frame$count
freqTritoken <- freqTritoken / max(freqTritoken)
names(freqTritoken) <- trigram.data.frame$token
As a proof-of-concept, only edit-distance-of-one typos are included which can cover about 80% of all typos. Typos can be corrected in four ways: replacing, inserting, deleting and transposing.
spellCorrection <- function(inputwd) {
charVec <- unlist(strsplit(inputwd, ""))
delete <- vector(mode="character", length=0)
transpose <- vector(mode="character", length=0)
replace <- vector(mode="character", length=0)
insert <- vector(mode="character", length=0)
### replace
for (i in 1:nchar(inputwd)) {
lettersMissOne <- letters[which(letters != charVec[i])]
replace <- c(replace,unlist(lapply(lettersMissOne, function(x) {charVec[i] <- x; return (paste(charVec, collapse=""))})))
}
### insert
for (i in 0:(nchar(inputwd)-1)) {
insert <- c(insert, unlist(lapply(letters, function(x) paste(append(charVec, x, after=i), collapse=""))))
}
if (nchar(inputwd) > 1) {
### delete
for (i in 1:nchar(inputwd)) {
delete <- c(delete, paste(charVec[-i], collapse=""))
}
### transpose
for (i in 1:(nchar(inputwd)-1)) {
temp <- charVec
swap <- temp[i]
temp[i] <- temp[i+1]
temp[i+1] <- swap
transpose <- c(transpose, paste(temp, collapse=""))
}
}
return (c(replace, insert, delete, transpose))
}
selectFirstTen <- function(x) {
ifelse(length(x)>10, x<-x[1:10], x<-x)
return (x)
}
Back-off algorithm is implemented: Trigram list will be searched first, if the number of candidates is less than ten, bigram list will be searched, and then unigram until we have ten words to plot.
ptm <- proc.time()
inputStr <- "great ca "
specialChar <- "|\\r|\\n|\\t|\\.|,|;|:|\"|(|)|\\?|!|-|/|&|—|–|“|”|…|‘|@|<|£|€|½|~|_|\\[|\\]|\\{|\\}|•|`|\\^|=|>|♥|❤|☀|♪|♫|⁰|☛"
cleanedStr <- tolower(gsub(specialChar, "", inputStr))
punctuation <- c(".", ";", "!", "?")
if (inputStr == "") {
output <- freqUnitoken[1:10]
names(output) <- unlist(lapply(names(freqUnitoken[1:10]), function(x) paste(toupper(substr(x, 1, 1)),substr(x, 2, nchar(x)), sep = "")))
} else {
lastChar <- substr(inputStr, nchar(inputStr), nchar(inputStr))
inputVec <- unlist(strsplit(inputStr, ' '))
cleanedVec <- unlist(strsplit(cleanedStr, ' '))
if (length(inputVec) == 1) {
if (lastChar == " ") {
bigram <- cleanedVec[length(cleanedVec)]
bigramRegex <- paste("^", bigram, " ", sep="")
matchedIdxBi <- grep(bigramRegex, names(freqBitoken), perl= TRUE)
bigramCandidate <- sort(freqBitoken[matchedIdxBi], decreasing = TRUE)
names(bigramCandidate) <- unlist(lapply(names(bigramCandidate), function(x) unlist(strsplit(x, ' '))[2]))
unigramCandidate <- freqUnitoken[1:10]
if (length(bigramCandidate) != 0)
unigramCandidate <- unigramCandidate*bigramCandidate[length(bigramCandidate)]
mergedCandidate <- c(bigramCandidate, unigramCandidate)
candidate <- mergedCandidate[unique(c(names(bigramCandidate), names(unigramCandidate)))][1:10]
if (substr(inputStr, nchar(inputStr)-1, nchar(inputStr)-1) %in% punctuation)
names(candidate) <- unlist(lapply(names(candidate), function(x) paste(toupper(substr(x, 1, 1)),substr(x, 2, nchar(x)), sep = "")))
output <- candidate
} else {
typingWord <- cleanedVec[length(cleanedVec)]
errorWords <- spellCorrection(typingWord)
knownWords <- sort(freqUnitoken[intersect(errorWords, names(freqUnitoken))], decreasing=TRUE)
typingWordRegex <- paste("^", tolower(typingWord), sep="")
matchedIdx <- grep(typingWordRegex, names(freqUnitoken), perl= TRUE)
unigramCandidate <- sort(freqUnitoken[unique(c(names(freqUnitoken[matchedIdx]), names(knownWords)))], decreasing=TRUE)
if (length(unigramCandidate) >= 10) {
candidate <- unigramCandidate[1:10]
} else {
zerogramCandidate <- freqUnitoken[1:(10+length(unigramCandidate))]
if (length(unigramCandidate) != 0)
zerogramCandidate <- zerogramCandidate/max(zerogramCandidate)*unigramCandidate[length(unigramCandidate)]
mergedCandidate <- c(unigramCandidate, zerogramCandidate)
candidate <- mergedCandidate[unique(c(names(unigramCandidate), names(zerogramCandidate)))][1:10]
}
if (substr(inputStr, 1, 1) %in% LETTERS)
names(candidate) <- unlist(lapply(names(candidate), function(x) paste(toupper(substr(x, 1, 1)),substr(x, 2, nchar(x)), sep = "")))
output <- candidate
}
} else if (length(inputVec) == 2) {
if (lastChar == " ") {
trigramRegex <- paste("^", cleanedStr, sep="")
matchedIdxTri <- grep(trigramRegex, names(freqTritoken), perl= TRUE)
trigramCandidate <- sort(freqTritoken[matchedIdxTri], decreasing = TRUE)
names(trigramCandidate) <- unlist(lapply(names(trigramCandidate), function(x) unlist(strsplit(x, ' '))[3]))
if (length(trigramCandidate) >= 10) {
candidate <- trigramCandidate[1:10]
} else {
bigram <- cleanedVec[length(cleanedVec)]
bigramRegex <- paste("^", bigram, " ", sep="")
matchedIdxBi <- grep(bigramRegex, names(freqBitoken), perl= TRUE)
bigramCandidate <- sort(freqBitoken[matchedIdxBi], decreasing = TRUE)
names(bigramCandidate) <- unlist(lapply(names(bigramCandidate), function(x) unlist(strsplit(x, ' '))[2]))
bigramCandidate <- bigramCandidate/max(bigramCandidate)*trigramCandidate[length(trigramCandidate)]
mergedCandidate <- c(trigramCandidate, bigramCandidate)
mergedCandidate <- mergedCandidate[unique(c(names(trigramCandidate), names(bigramCandidate)))]
if (length(mergedCandidate) >= 10) {
candidate <- mergedCandidate[1:10]
} else {
unigramCandidate <- freqUnitoken[1:(10+length(mergedCandidate))]*mergedCandidate[length(mergedCandidate)]
mergedCandidate <- c(trigramCandidate, bigramCandidate, unigramCandidate)
candidate <- mergedCandidate[unique(c(names(trigramCandidate), names(bigramCandidate), names(unigramCandidate)))][1:10]
}
}
if (substr(inputStr, nchar(inputStr)-1, nchar(inputStr)-1) %in% punctuation)
names(candidate) <- unlist(lapply(names(candidate), function(x) paste(toupper(substr(x, 1, 1)),substr(x, 2, nchar(x)), sep = "")))
output <- candidate
} else { ### length of inputVec is 2 and lastChar is NOT " ".
typingWord <- cleanedVec[length(cleanedVec)]
bigramRegex <- paste("^", cleanedStr, sep="")
matchedIdxBi <- grep(bigramRegex, names(freqBitoken), perl= TRUE)
bigramCandidate <- sort(freqBitoken[matchedIdxBi], decreasing = TRUE)
bigramCandidate <- selectFirstTen(bigramCandidate)
names(bigramCandidate) <- unlist(lapply(names(bigramCandidate), function(x) unlist(strsplit(x, ' '))[2]))
if (length(bigramCandidate) >= 10) {
candidate <- bigramCandidate[1:10]
} else {
errorWords <- spellCorrection(typingWord)
knownWords <- sort(freqUnitoken[intersect(errorWords, names(freqUnitoken))], decreasing=TRUE)
typingWordRegex <- paste("^", typingWord, sep="")
matchedIdx <- grep(typingWordRegex, names(freqUnitoken), perl= TRUE)
unigramCandidate <- sort(freqUnitoken[unique(c(names(freqUnitoken[matchedIdx]), names(knownWords)))], decreasing=TRUE)
if (length(bigramCandidate) != 0)
unigramCandidate <- unigramCandidate/max(unigramCandidate)*bigramCandidate[length(bigramCandidate)]
mergedCandidate <- c(bigramCandidate, unigramCandidate)
candidate <- mergedCandidate[unique(c(names(bigramCandidate), names(unigramCandidate)))][1:10]
}
if (substr(inputVec[length(inputVec)], 1, 1) %in% LETTERS)
names(candidate) <- unlist(lapply(names(candidate), function(x) paste(toupper(substr(x, 1, 1)),substr(x, 2, nchar(x)), sep = "")))
output <- candidate
} ### length of inputVec is 2 and lastChar is NOT " ".
} else { ### length of inputVec is larger than 2.
if (lastChar == " ") { ### length of inputVec is larger than 2 and lastChar is " ".
trigram <- paste(cleanedVec[length(cleanedVec)-1], cleanedVec[length(cleanedVec)])
trigramRegex <- paste("^", trigram, " ", sep="")
matchedIdxTri <- grep(trigramRegex, names(freqTritoken), perl= TRUE)
trigramCandidate <- sort(freqTritoken[matchedIdxTri], decreasing = TRUE)
names(trigramCandidate) <- unlist(lapply(names(trigramCandidate), function(x) unlist(strsplit(x, ' '))[3]))
if (length(trigramCandidate) >= 10) {
candidate <- trigramCandidate[1:10]
} else {
bigram <- cleanedVec[length(cleanedVec)]
bigramRegex <- paste("^", bigram, " ", sep="")
matchedIdxBi <- grep(bigramRegex, names(freqBitoken), perl= TRUE)
bigramCandidate <- sort(freqBitoken[matchedIdxBi], decreasing = TRUE)
names(bigramCandidate) <- unlist(lapply(names(bigramCandidate), function(x) unlist(strsplit(x, ' '))[2]))
if (length(trigramCandidate) != 0)
bigramCandidate <- bigramCandidate/max(bigramCandidate)*trigramCandidate[length(trigramCandidate)]
mergedCandidate <- c(trigramCandidate, bigramCandidate)
mergedCandidate <- mergedCandidate[unique(c(names(trigramCandidate), names(bigramCandidate)))]
if (length(mergedCandidate) >= 10) {
candidate <- mergedCandidate[1:10]
} else {
unigramCandidate <- freqUnitoken[1:10]
if (length(mergedCandidate) != 0)
unigramCandidate <- unigramCandidate/max(unigramCandidate)*mergedCandidate[length(mergedCandidate)]
mergedCandidate2 <- c(mergedCandidate, unigramCandidate)
mergedCandidate2 <- mergedCandidate2[unique(c(names(mergedCandidate), names(unigramCandidate)))]
candidate <- mergedCandidate2[1:10]
}
}
if (substr(inputStr, nchar(inputStr)-1, nchar(inputStr)-1) %in% punctuation)
names(candidate) <- unlist(lapply(names(candidate), function(x) paste(toupper(substr(x, 1, 1)),substr(x, 2, nchar(x)), sep = "")))
output <- candidate
} else { ### length of inputVec is larger than 2 and lastChar is NOT " ".
typingWord <- cleanedVec[length(cleanedVec)]
trigram <- paste(cleanedVec[length(cleanedVec)-2], cleanedVec[length(cleanedVec)-1], cleanedVec[length(cleanedVec)])
trigramRegex <- paste("^", trigram, sep="")
matchedIdxTri <- grep(trigramRegex, names(freqTritoken), perl= TRUE)
trigramCandidate <- sort(freqTritoken[matchedIdxTri], decreasing = TRUE)
names(trigramCandidate) <- unlist(lapply(names(trigramCandidate), function(x) unlist(strsplit(x, ' '))[3]))
if (length(trigramCandidate) >= 10) {
candidate <- trigramCandidate[1:10]
} else {
bigram <- paste(cleanedVec[length(cleanedVec)-1], cleanedVec[length(cleanedVec)])
bigramRegex <- paste("^", bigram, sep="")
matchedIdxBi <- grep(bigramRegex, names(freqBitoken), perl= TRUE)
bigramCandidate <- sort(freqBitoken[matchedIdxBi], decreasing = TRUE)
names(bigramCandidate) <- unlist(lapply(names(bigramCandidate), function(x) unlist(strsplit(x, ' '))[2]))
if (length(trigramCandidate) != 0)
bigramCandidate <- bigramCandidate/max(bigramCandidate)*trigramCandidate[length(trigramCandidate)]
mergedCandidate <- c(trigramCandidate, bigramCandidate)
mergedCandidate <- mergedCandidate[unique(c(names(trigramCandidate), names(bigramCandidate)))]
if (length(mergedCandidate) >= 10) {
candidate <- mergedCandidate[1:10]
} else {
errorWords <- spellCorrection(typingWord)
knownWords <- sort(freqUnitoken[intersect(errorWords, names(freqUnitoken))], decreasing=TRUE)
typingWordRegex <- paste("^", typingWord, sep="")
matchedIdx <- grep(typingWordRegex, names(freqUnitoken), perl= TRUE)
unigramCandidate <- sort(freqUnitoken[unique(c(names(freqUnitoken[matchedIdx]), names(knownWords)))], decreasing=TRUE)
if (length(mergedCandidate) != 0)
unigramCandidate <- unigramCandidate/max(unigramCandidate)*mergedCandidate[length(mergedCandidate)]
mergedCandidate2 <- c(mergedCandidate, unigramCandidate)
mergedCandidate2 <- mergedCandidate2[unique(c(names(mergedCandidate), names(unigramCandidate)))]
if (length(mergedCandidate2) >= 10) {
candidate <- mergedCandidate2[1:10]
} else {
zerogramCandidate <- freqUnitoken[1:10]
if (length(mergedCandidate2) != 0)
zerogramCandidate <- zerogramCandidate/max(zerogramCandidate)*mergedCandidate2[length(mergedCandidate2)]
mergedCandidate3 <- c(mergedCandidate2, zerogramCandidate)
mergedCandidate3 <- mergedCandidate3[unique(c(names(mergedCandidate2), names(zerogramCandidate)))]
candidate <- mergedCandidate3[1:10]
}
}
}
if (substr(inputVec[length(inputVec)], 1, 1) %in% LETTERS)
names(candidate) <- unlist(lapply(names(candidate), function(x) paste(toupper(substr(x, 1, 1)),substr(x, 2, nchar(x)), sep = "")))
output <- candidate
}
}
}
library(wordcloud)
library(RColorBrewer)
pal2 <- brewer.pal(8,"Dark2")
wordcloud(names(output),sqrt(output), scale=c(10,0.5),min.freq=0, max.words=Inf, random.order=FALSE, rot.per=.15, colors=pal2)