The goal of this project is to produce a word prediction application for end users on the web.
This report contains: loading a sample of the data, exploratory analysis of single words, bigrams and trigrams, and a first next-word prediction model.
Once we download the Capstone Dataset, we need to load some data for analysis. Because the text files are large, we load only a 3% sample from each file.
# read a random sample of lines from each file;
# `prop` is the probability of keeping each line
readText <- function(files, prop) {
  text <- NULL; rn <- 0
  for (i in files) {
    con <- file(description = i, "r")
    n <- 0
    while (TRUE) {
      ln <- readLines(con, 1, skipNul = TRUE)
      if (length(ln) == 0) break
      # keep the line with probability `prop`
      if (rbinom(1, 1, prop) == 1) {
        src <- i
        if (grepl(".blogs.", i, fixed = TRUE)) src <- "blogs"
        if (grepl(".news.", i, fixed = TRUE)) src <- "news"
        if (grepl(".twitter.", i, fixed = TRUE)) src <- "twitter"
        n <- n + 1; rn <- rn + 1
        text <- rbind(text, tibble(doc_id = rn, text = ln, line = n, source = src))
      }
    }
    close(con)
  }
  text
}
# take a 3% text sample from the three source files
text <- as_tibble(readText(c("data/en_US.news.txt", "data/en_US.blogs.txt", "data/en_US.twitter.txt"), 0.03))
Now we have a text sample of 127,897 lines from 3 sources. Let’s dive a little into this text. We use the tidytext package for the exploratory analysis.
# split the text into words
tidytxt <- text %>% unnest_tokens(word, text)
# drop common English stop words (little or no meaning on their own)
# and filter profanity using the Google bad-words dictionary
tidytxt <- tidytxt %>% anti_join(stop_words) %>% anti_join(ProfanityFilter())
# remove digits, special symbols and non-English characters;
# this also strips out words from foreign languages
tidytxt$word <- rmSpec(tidytxt$word)
# filter out empty strings and a few extra stop words
mystopwords <- tibble(word = c("", "i", "it", "pm", "you", "ve", "we", "rt", "ll", "th", "st", "they", "gt"))
tidytxt <- tidytxt %>% anti_join(mystopwords)
# count word frequencies and the cumulative proportion of the corpus they cover
tidytxtFreq <- tidytxt %>% count(word, sort = TRUE) %>% mutate(proportion = cumsum(n / sum(n)))
First of all, let’s look at word frequency. There are 89,617 unique words in total, but only 1,644 words cover 50% of all word instances in the corpus, and 19,385 words cover 90%.
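These coverage counts can be read directly from the cumulative proportion column computed above. A minimal check (a sketch, using the tidytxtFreq tibble from the previous chunk):
# how many of the most frequent words are needed to cover 50% and 90% of all word instances
sum(tidytxtFreq$proportion <= 0.5)   # about 1644 in our sample
sum(tidytxtFreq$proportion <= 0.9)   # about 19385 in our sample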
Now let’s look at some of the most common words in the corpus. Word size represents its frequency in the text.
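One way to draw such a picture is a word cloud; the sketch below assumes the wordcloud package and may differ from the exact call used for the figure.
library(wordcloud)
# plot the 100 most frequent words, sized by their counts
with(head(tidytxtFreq, 100), wordcloud(word, n, scale = c(4, 0.5), colors = "steelblue"))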
As a last step of the single-word analysis, let’s look at how word usage differs across sources.
You can see that many words appear in blogs, news and twitter alike, but a word’s proportion can vary between sources.
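A sketch of how the per-source comparison can be computed from the tidy tokens (the freqBySource name is only illustrative; the plotting code is omitted here):
# word frequency within each source, as a share of that source's tokens
freqBySource <- tidytxt %>%
  count(source, word) %>%
  group_by(source) %>%
  mutate(proportion = n / sum(n)) %>%
  ungroup()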
Now let’s see what we can find in the bigrams and trigrams of the corpus. We use the same tools for this analysis and then add some ggplot graphs. The code output, which looks much like the single-word analysis, is not shown.
bigrams <- text %>% unnest_tokens(bigram, text, token = "ngrams", n = 2) %>% filter(!is.na(bigram))
bigrams_sep <- bigrams %>% separate(bigram, c("word1", "word2"), sep = " ")
trigrams <- text %>% unnest_tokens(trigram, text, token = "ngrams", n = 3) %>% filter(!is.na(trigram))
trigrams_sep <- trigrams %>% separate(trigram, c("word1", "word2", "word3"), sep = " ")
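For example, one of the ggplot graphs could show the most frequent bigrams; the sketch below assumes ggplot2 is loaded and may differ from the exact plots in the report.
# bar chart of the 15 most frequent bigrams
library(ggplot2)
bigrams %>%
  count(bigram, sort = TRUE) %>%
  slice_head(n = 15) %>%
  ggplot(aes(x = n, y = reorder(bigram, n))) +
  geom_col() +
  labs(x = "count", y = NULL, title = "Most frequent bigrams")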
The first model that comes to mind is a dictionary of bigrams and trigrams sorted by their frequency in the text.
clearText <- text
clearText$text <- text$text %>% tolower() %>% skipWords %>% rmSpec() %>% skipWords %>% stripWhitespace()
# empty strings and a few extra stop words to filter out
mystopwords <- tibble(word = c("", "i", "it", "pm", "you", "ve", "we", "rt",
                               "ll", "th", "st", "they", "gt"))
# profane words as a character vector (ProfanityFilter() returns a tibble with a `word` column)
profanity <- ProfanityFilter()$word
# one-word dictionary
OneWordDict <- clearText %>%
  unnest_tokens(word, text) %>%
  anti_join(ProfanityFilter()) %>%
  count(word, sort = TRUE)
# bigram dictionary: keep the top 3 continuations for every first word
BigramDict <- clearText %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram)) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% profanity, !word1 %in% mystopwords$word) %>%
  filter(!word2 %in% profanity, !word2 %in% mystopwords$word) %>%
  count(word1, word2, sort = TRUE) %>%
  group_by(word1) %>%
  top_n(3)
# trigram dictionary: keep the most frequent continuation for every word pair
TrigramDict <- clearText %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  filter(!is.na(trigram)) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(!word1 %in% profanity, !word1 %in% mystopwords$word) %>%
  filter(!word2 %in% profanity, !word2 %in% mystopwords$word) %>%
  filter(!word3 %in% profanity, !word3 %in% mystopwords$word) %>%
  count(word1, word2, word3, sort = TRUE) %>%
  group_by(word1, word2) %>%
  top_n(1)
We use the most popular combinations (the top three bigrams and the top trigram) to predict the next word of the text that the user sends to the model. This approach compresses the amount of data needed for prediction compared with the raw text, but the dictionaries still take 29.4 Mb for bigrams and 146.8 Mb for trigrams.
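These sizes can be checked in R with base object.size (a sketch; the exact numbers depend on the sample):
# approximate in-memory size of the dictionaries
format(object.size(BigramDict), units = "Mb")
format(object.size(TrigramDict), units = "Mb")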
The model works like this:
nextWord <- function(sent = NULL) {
  # no input: suggest a random word from the one-word dictionary
  if (is.null(sent)) {
    return(sample(OneWordDict$word, 1))
  }
  # clean the input the same way as the training text
  sent <- sent %>% tolower() %>% skipWords %>% rmSpec() %>% skipWords %>% stripWhitespace()
  words <- tibble(text = sent) %>% unnest_tokens(word, text)
  nn <- nrow(words)
  # candidates from the bigram dictionary, based on the last word
  res <- BigramDict[BigramDict$word1 == words$word[nn], ]$word2
  # with at least two words of context, put the trigram candidates first
  if (nn >= 2)
    res <- unique(c(
      TrigramDict[TrigramDict$word1 == words$word[nn - 1] & TrigramDict$word2 == words$word[nn], ]$word3[1:2],
      res))
  res <- res[!is.na(res)]
  # return at most three suggestions
  head(res, 3)
}
Let’s look at some results:
nextWord("World")
## [1] "series" "war" "peace"
nextWord("World War")
## [1] "ii" "afghanistan" "terror"
x1 <- nextWord()
x1
## [1] "swayers"
x2 <- nextWord(x1)
x2
## [1] "dancers"
nextWord(paste(x1, x2[1]))
## [1] "shouters" "can" "also"
Next steps: