library(stringi)
library(tm)
## Loading required package: NLP
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
setwd("/Users/rachitgupta/Desktop")
blogs <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
news <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
# Disk size (in MB)
blogs_dsize <- file.info("en_US.blogs.txt")$size / 1024 / 1024
news_dsize <- file.info("en_US.news.txt")$size / 1024 / 1024
twitter_dsize <- file.info("en_US.twitter.txt")$size / 1024 / 1024
#In-memory size (in MB)
blogs_msize<-object.size(blogs) / 1024 / 1024
news_msize<-object.size(news) / 1024 / 1024
twitter_msize<-object.size(twitter) / 1024 / 1024
# Words in lines
blogs_words <- stri_count_words(blogs)
news_words <- stri_count_words(news)
twitter_words <- stri_count_words(twitter)
# Summary
data.frame(source = c("blogs", "news", "twitter"),
files_MB = c(blogs_dsize, news_dsize, twitter_dsize),
in_memory_MB = c(blogs_msize, news_msize, twitter_msize),
lines = c(length(blogs), length(news), length(twitter)),
words_num = c(sum(blogs_words), sum(news_words), sum(twitter_words)),
mean_words_num = c(mean(blogs_words), mean(news_words), mean(twitter_words)))
## source files_MB in_memory_MB lines words_num mean_words_num
## 1 blogs 200.4242 255.3545 899288 37546239 41.75107
## 2 news 196.2775 257.3404 1010242 34762395 34.40997
## 3 twitter 159.3641 318.9897 2360148 30093413 12.75065
We clean the data by removing special characters, numbers, punctuation and excess whitespace, then converting the text to lower case and removing stopwords.
Building a corpus is a very labour-intensive operation, and processing nearly 1 GB of in-memory text effectively halts the computer, so we load only 1% of the lines from each source.
The goal here is to create different n-grams from this corpus and analyze the frequency of words and phrases.
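The sampling and cleaning code is not shown above, so here is a minimal sketch of how the corpus object used below could be built with standard tm transformations (names such as sample_pct, blogs_sample, news_sample and twitter_sample are illustrative, not the exact code used):
set.seed(1234)
sample_pct <- 0.01
blogs_sample <- sample(blogs, round(length(blogs) * sample_pct))
news_sample <- sample(news, round(length(news) * sample_pct))
twitter_sample <- sample(twitter, round(length(twitter) * sample_pct))
corpus <- VCorpus(VectorSource(c(blogs_sample, news_sample, twitter_sample)))
# Drop non-ASCII characters, then apply the standard tm clean-up steps
corpus <- tm_map(corpus, content_transformer(function(x) iconv(x, "UTF-8", "ASCII", sub = "")))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, stripWhitespace)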
#bigram_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
#trigram_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
#RWeka controls failed for me with a strange instant error, so I used these alternative tokenizers instead:
BigramTokenizer <-
  function(x)
    unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
TrigramTokenizer <-
  function(x)
    unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)
# Build a frequency table (word, freq) from a term-document matrix
freq_df <- function(tdm){
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  data.frame(word = names(freq), freq = freq)
}
unigram <- removeSparseTerms(TermDocumentMatrix(corpus), 0.9999)
unigram_freq <- freq_df(unigram)
bigram <- removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = BigramTokenizer)), 0.9999)
bigram_freq <- freq_df(bigram)
trigram <- removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = TrigramTokenizer)), 0.9999)
trigram_freq <- freq_df(trigram)
freq_plot <- function(data, title) {
  ggplot(data[1:25, ], aes(reorder(word, -freq), freq)) +
    labs(x = "Words/Phrases", y = "Frequency") +
    ggtitle(title) +
    theme(axis.text.x = element_text(angle = 90, size = 12, hjust = 1)) +
    geom_bar(stat = "identity")
}
freq_plot(unigram_freq, "Top-25 Unigrams")
freq_plot(bigram_freq, "Top-25 Bigrams")
freq_plot(trigram_freq, "Top-25 Trigrams")
As mentioned above, loading and processing the dataset takes a lot of time, especially building the corpus, so we had to limit the amount of text used in this analysis.
Stopwords are a fundamental part of the language, so we have to test whether they should be included in the prediction algorithm.
The next step of this project is to build a predictive algorithm, test it, wrap it in a Shiny application and deploy it to shinyapps.io.
We still have to find and test strategies for predicting the next word. We are currently thinking about a trigram model, but this is not the final solution and needs testing: if the trigram model fails to predict the next word, we back off to the bigram model.
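A minimal sketch of this back-off idea, assuming the bigram_freq and trigram_freq tables computed above (the helper name predict_next and the simple whitespace tokenization are illustrative, not the final implementation):
# Look up the last two words in the trigram table; if nothing matches,
# back off to the last word in the bigram table; return NA if both fail.
predict_next <- function(phrase, trigram_freq, bigram_freq) {
  tokens <- unlist(strsplit(tolower(phrase), "\\s+"))
  n <- length(tokens)
  if (n >= 2) {
    prefix <- paste(tokens[n - 1], tokens[n])
    hits <- trigram_freq[startsWith(as.character(trigram_freq$word), paste0(prefix, " ")), ]
    if (nrow(hits) > 0)
      return(sub(".* ", "", as.character(hits$word[which.max(hits$freq)])))
  }
  if (n >= 1) {
    prefix <- tokens[n]
    hits <- bigram_freq[startsWith(as.character(bigram_freq$word), paste0(prefix, " ")), ]
    if (nrow(hits) > 0)
      return(sub(".* ", "", as.character(hits$word[which.max(hits$freq)])))
  }
  NA_character_
}
# Example: predict_next("thanks for the", trigram_freq, bigram_freq)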