knitr::opts_chunk$set(warning=FALSE, message=FALSE)
Load the required libraries, installing any that are missing.
libraries <- c("stringr", "tm", "dplyr", "ggplot2", "RWeka")
for (lib in libraries) {
  if (!require(lib, character.only = TRUE)) {
    install.packages(lib)
  }
  library(lib, character.only = TRUE)
}
Download and unzip the data if it is not already present.
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
file <- "Coursera-SwiftKey.zip"
if (!file.exists(file)) {
  download.file(url, destfile = file, mode = "wb")
  unzip(file, exdir = ".")
}
data_folder <- "final/en_US/"
twitter <- readLines(paste0(data_folder, "en_US.twitter.txt"), warn = FALSE)
blog <- readLines(paste0(data_folder, "en_US.blogs.txt"), warn = FALSE)
news <- readLines(paste0(data_folder, "en_US.news.txt"), warn = FALSE)
Count the lines read from the files. The Twitter file has 2360148 lines, the blog file has 899288 lines, and the news file has 77259 lines, which gives 3336695 lines in total.
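These counts can be verified directly from the vectors returned by readLines (a minimal check, run before the sampling step below):
# Line counts of the raw files (before sampling)
length(twitter)
length(blog)
length(news)
length(twitter) + length(blog) + length(news)  # total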
Take a subset of the data due to performance constraints.
sample_size <- 1500
twitter <- sample(twitter, sample_size)
blog <- sample(blog, sample_size)
news <- sample(news, sample_size)
all <- c(twitter, blog, news)
Corpus generation function: it removes URLs, Twitter handles, punctuation, and numbers, strips extra whitespace, converts the text to lower case, and removes stop words.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
generate_corpus <- function(text, stop_words = "") {
  corpus <- VCorpus(VectorSource(text))
  corpus <- tm_map(corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")  # remove URLs
  corpus <- tm_map(corpus, toSpace, "@[^\\s]+")                      # remove Twitter handles
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, stripWhitespace)
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeWords, stop_words)
  corpus
}
Tokenizer functions built on RWeka's NGramTokenizer.
uniTokenizer <- function(x) NGramTokenizer(x, Weka_control(min=1, max=1))
biTokenizer <- function(x) NGramTokenizer(x, Weka_control(min=2, max=2))
triTokenizer <- function(x) NGramTokenizer(x, Weka_control(min=3, max=3))
modelTokenizer <- function(x) NGramTokenizer(x, Weka_control(min=2, max=3))
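As a quick illustration (not part of the original pipeline), the bigram tokenizer should split a short string into overlapping two-word sequences:
# Illustrative call only: expected to yield bigrams such as
# "this is", "is a", "a test".
biTokenizer("this is a test")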
Function that generates a word frequency table from a corpus.
word_freq <- function(corpus, control_list) {
  tdm <- as.matrix(TermDocumentMatrix(corpus, control = control_list))
  FreqMat <- data.frame(ST = rownames(tdm),
                        Freq = rowSums(tdm),
                        row.names = NULL)
  FreqMat[order(-FreqMat$Freq), ]
}
A simple and very naive word prediction model. It predicts the next word based only on the frequencies of 2-grams and 3-grams, falling back to sampling single words by frequency when too few matches are found.
create_model <- function(text, max_words = 3) {
  print("Creating model")
  corpus <- generate_corpus(text, stop_words = stopwords("en"))
  one_grams <- word_freq(corpus, list(tokenize = uniTokenizer))
  n_grams <- word_freq(corpus, list(tokenize = modelTokenizer))
  model <- function(word) {
    # n-grams containing the query word followed by at least one more word
    words_grep <- n_grams[grep(paste0(word, "\\s"), n_grams$ST), ]
    count <- max_words
    words <- ""
    if (nrow(words_grep) > 0) {
      # take the word following the query word from the most frequent matches
      for (str in head(words_grep$ST, max_words)) {
        next_word <- str_extract(str, paste0("(?<=", word, "\\s)\\w+"))
        words <- paste(words, next_word)
        count <- count - 1
      }
    }
    if (count > 0) {
      # fill the remaining slots with words sampled by unigram frequency
      for (i in 1:count) {
        res <- sample_n(one_grams, 1, weight = one_grams$Freq)
        words <- paste(words, res$ST)
      }
    }
    return(words)
  }
  model
}
Generate the corpus from the combined sample.
all_corps <- generate_corpus(all, stop_words = stopwords("en"))
Top ten two-grams.
all_2_gram <- word_freq(all_corps, list(tokenize = biTokenizer))
ggplot(all_2_gram[1:10, ], aes(x = Freq, y = reorder(ST, Freq))) +
  geom_bar(fill = "red", stat = "identity") +
  ggtitle("Top 10 two-grams") +
  xlab("Frequency") +
  ylab("Two-gram") +
  theme_dark()
Top ten three-grams.
all_3_gram <- word_freq(all_corps, list(tokenize = triTokenizer))
ggplot(all_3_gram[1:10, ], aes(x = Freq, y = reorder(ST, Freq))) +
  geom_bar(fill = "red", stat = "identity") +
  ggtitle("Top 10 three-grams") +
  xlab("Frequency") +
  ylab("Three-gram") +
  theme_dark()
Top ten words.
all_1_gram <- word_freq(all_corps, list(tokenize = uniTokenizer))
ggplot(all_1_gram[1:10, ], aes(x = Freq, y = reorder(ST, Freq))) +
  geom_bar(fill = "red", stat = "identity") +
  ggtitle("Top 10 words") +
  xlab("Frequency") +
  ylab("Word") +
  theme_dark()
The model should return three next-word suggestions (duplicate words are possible).
model <- create_model(all)
## [1] "Creating model"
model("north")
## [1] " dakota dakota america"
model("love")
## [1] " reading u every"
model("man")
## [1] " said services year"
model("test")
## [1] " guy comedies love"
The performance of this simple naive model is really poor. Using more data would make it work better, but the size of the n-gram tables grows rapidly with corpus size, so that alone is not a practical solution. A better approach seems to be a deep-learning model, although that is still time consuming to train and needs a lot of data.
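As a rough illustration of this growth, even for the small sample used here the number of distinct terms can be compared across n-gram orders using the frequency tables built above (exact counts depend on the random sample):
# Distinct terms per n-gram order in the sampled corpus;
# higher orders produce many more unique (and mostly rare) entries.
data.frame(order = c("1-gram", "2-gram", "3-gram"),
           distinct_terms = c(nrow(all_1_gram), nrow(all_2_gram), nrow(all_3_gram)))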