knitr::opts_chunk$set(echo=T)
suppressPackageStartupMessages(library(tm))
# Read the three English corpora (blogs, news, Twitter), skipping embedded nuls
blogs <- readLines("../final/en_US/en_US.blogs.txt", skipNul=T, encoding="UTF-8")
news <- readLines("../final/en_US/en_US.news.txt", skipNul=T, encoding="UTF-8")
twits <- readLines("../final/en_US/en_US.twitter.txt", skipNul=T, encoding="UTF-8")
# Combine into a single vector and cache it to disk for later sessions
eng_data <- c(blogs, news, twits)
saveRDS(object=eng_data, file="eng_data.rds")
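For orientation, the size of each source can be summarized before combining; a quick sketch (the actual counts depend on the downloaded files):
# Line counts of the three sources (values depend on the local files)
data.frame(Source=c("blogs", "news", "twitter"),
           Lines=c(length(blogs), length(news), length(twits)))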
# Define helper function & read helper file
# delPat replaces every match of `pat` with a space inside a tm document
delPat <- content_transformer(function(strng, pat) gsub(pattern=pat, replacement=" ", x=strng))
# The first column of the bad-words file is the profanity list used for filtering
profane_words <- read.delim(file="../Milestone/badwords.txt", header=F)[, 1]
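As a quick illustration (a toy example, not part of the pipeline above), delPat can be applied to a one-document corpus to collapse a URL to whitespace, using the same URL pattern as the cleaning step below:
toy <- VCorpus(VectorSource("visit http://example.com today"))
toy <- tm_map(x=toy, FUN=delPat, "(f|ht)tp(s?)://(.*)[.][a-z]+")
content(toy[[1]])
## [1] "visit   today"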
nextWord <- function(inp_txt) {
  # Create a corpus from the entries containing the input text
  entries <- eng_data[grepl(pattern=inp_txt, x=eng_data, ignore.case=T)]
  if (length(entries) == 0) { return(data.frame(Word="-", Counts="None")) }
  # Capture the word immediately following the input text in each entry
  regex_str <- paste(inp_txt, "([^ ]+)")
  targetWords <- character(0)
  for (i in seq_along(entries)) {
    match_idx <- regexec(pattern=regex_str, text=entries[i], ignore.case=T)
    targetWords <- c(targetWords, regmatches(x=entries[i], m=match_idx)[[1]][2])
  }
  # Drop entries where the input text ended the line (no following word)
  targetWords <- targetWords[!is.na(targetWords)]
  if (length(targetWords) == 0) { return(data.frame(Word="-", Counts="None")) }
  corp <- VCorpus(VectorSource(targetWords))
  # Clean the corpus: URLs, non-letters, numbers, case, stop words,
  # profanity, and extra whitespace
  corp <- tm_map(x=corp, FUN=delPat, "(f|ht)tp(s?)://(.*)[.][a-z]+")
  corp <- tm_map(x=corp, FUN=delPat, "[^a-zA-Z ]")
  corp <- tm_map(x=corp, FUN=removePunctuation)
  corp <- tm_map(x=corp, FUN=removeNumbers)
  corp <- tm_map(x=corp, FUN=content_transformer(FUN=tolower))
  corp <- tm_map(x=corp, FUN=removeWords, stopwords(kind="en"))
  corp <- tm_map(x=corp, FUN=removeWords, profane_words)
  corp <- tm_map(x=corp, FUN=stripWhitespace)
  corp <- tm_map(x=corp, FUN=PlainTextDocument)
  # Compute frequencies of each word/unigram
  docTermMat <- as.matrix(x=DocumentTermMatrix(corp))
  freq <- sort(colSums(x=docTermMat), decreasing=T)
  if (length(freq) == 0) { return(data.frame(Word="-", Counts="None")) }
  df <- data.frame(Word=names(freq), Counts=freq)
  rownames(df) <- NULL
  # Return at most the ten most frequent candidates
  numWord <- min(length(freq), 10)
  return(df[1:numWord, ])
}
The function was timed for input texts of different lengths:
system.time(expr=nextWord("in a case of"))
##    user  system elapsed
##   62.27    5.20   93.03
system.time(expr=nextWord("a case of"))
##    user  system elapsed
##   49.16    0.23   51.56
system.time(expr=nextWord("case of"))
##    user  system elapsed
##   53.20    0.10   58.47
In general, run time grows with the length of the input text, since the initial grepl scan over the full corpus dominates the cost. The three-word phrase is the fastest of the three here, presumably because the shorter two-word pattern matches many more entries and so inflates the per-entry extraction and cleaning steps.
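Independent of input length, every call rescans the entire corpus. A common alternative (not implemented in this report, just a hedged sketch) is to precompute an n-gram frequency table once and answer queries by lookup. The helper names below (build_trigrams, nextWordFast) are illustrative, a two-word history is assumed, and for the full corpus the table would be large enough that sampling lines or a more compact store would be needed:
# Sketch only (not the report's code): precompute trigram counts once, then
# answer queries by table lookup instead of a fresh corpus scan
build_trigrams <- function(lines) {
  # Lowercase, strip non-letters, and split each line into words
  toks <- strsplit(gsub("[^a-z ]", " ", tolower(lines)), "[ ]+")
  grams <- unlist(lapply(toks, function(w) {
    w <- w[w != ""]
    if (length(w) < 3) return(character(0))
    # All consecutive word triples in the line
    paste(head(w, -2), w[-c(1, length(w))], tail(w, -2))
  }))
  table(grams)
}
trigram_tab <- build_trigrams(eng_data)  # one-time cost, amortized over all queries
nextWordFast <- function(history) {
  # Keep trigrams whose first two words match the (two-word) history
  hits <- trigram_tab[startsWith(names(trigram_tab), paste0(tolower(history), " "))]
  hits <- head(sort(hits, decreasing=TRUE), 10)
  data.frame(Word=sub("^.* ", "", names(hits)), Counts=as.integer(hits))
}
With such a table, a call like nextWordFast("case of") reduces to subsetting precomputed counts rather than rescanning and re-cleaning the corpus on every query.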
The quiz questions are answered using the prediction function defined above:
nextWord("a case of")
## Word Counts
## 1 beer 19
## 2 wine 7
## 3 first 4
## 4 miller 4
## 5 mistaken 4
## 6 making 3
## 7 one 3
## 8 waiting 3
## 9 water 3
## 10 crossed 2
nextWord("would mean the")
## Word Counts
## 1 world 203
## 2 absolute 2
## 3 end 2
## 4 entire 2
## 5 airport 1
## 6 angles 1
## 7 central 1
## 8 century 1
## 9 death 1
## 10 person 1
nextWord("make me the")
## Word Counts
## 1 happiest 28
## 2 best 2
## 3 worlds 2
## 4 bad 1
## 5 biggest 1
## 6 blame 1
## 7 bun 1
## 8 daughter 1
## 9 first 1
## 10 girl 1
nextWord("struggling but")
## Word Counts
## 1 avoiding 1
## 2 remember 1
## 3 westbrook 1
nextWord("date at the")
## Word Counts
## 1 end 5
## 2 time 3
## 3 app 1
## 4 art 1
## 5 bottom 1
## 6 braves 1
## 7 cake 1
## 8 cheese 1
## 9 driskill 1
## 10 four 1
nextWord("be on my")
## Word Counts
## 1 way 36
## 2 mind 14
## 3 show 8
## 4 list 7
## 5 side 7
## 6 best 5
## 7 game 5
## 8 ipod 5
## 9 couch 4
## 10 radio 4
nextWord("in quite some")
## Word Counts
## 1 time 36
nextWord("with his little")
## Word Counts
## 1 brother 4
## 2 bedroom 1
## 3 bitty 1
## 4 brothers 1
## 5 claws 1
## 6 company 1
## 7 cousins 1
## 8 daughter 1
## 9 dog 1
## 10 embroidered 1
nextWord("faith during the")
## Word Counts
## 1 worship 1
nextWord("you must be")
## Word Counts
## 1 able 29
## 2 willing 12
## 3 bored 9
## 4 really 9
## 5 wondering 9
## 6 born 8
## 7 one 8
## 8 tired 8
## 9 watching 7
## 10 new 6