This capstone project uses tm, RWeka and other R packages to analyze text data collected from Twitter, blogs and news sites, with the goal of building an app that predicts the next word a user will type.
Firstly, let’s load all the packages we’re going to use in this project.
library(tm)
library(wordcloud)
library(RWeka)
library(dplyr)
library(slam)
library(qdap)
library(stringi)
library(ggplot2)
Next, open a connection to each source file and read it into memory.
twitter_con <- file('en_US.twitter.txt', 'r')
news_con <- file('en_US.news.txt', 'r')
blogs_con <- file('en_US.blogs.txt', 'r')
twitter <- readLines(twitter_con, skipNul = TRUE)
close(twitter_con)
# only the first 77,258 lines of the news file are read; beyond that point the
# file contains a control character that stops readLines on some systems
news <- readLines(news_con, skipNul = TRUE, n = 77258)
close(news_con)
blogs <- readLines(blogs_con, skipNul = TRUE)
close(blogs_con)
Let's summarize each dataset.
words_twitter <- stri_count_words(twitter)
words_blogs <- stri_count_words(blogs)
words_news <- stri_count_words(news)
summary_table <- data.frame(data = c('twitter', 'blogs', 'news'),
                            lines = c(length(twitter), length(blogs), length(news)),
                            words = c(sum(words_twitter), sum(words_blogs), sum(words_news)))
summary_table
##      data   lines    words
## 1 twitter 2360148 30249959
## 2   blogs  899288 38916603
## 3    news   77258  2711940
These numbers show that analyzing the full datasets is impractical with our limited local resources, so we instead randomly select samples to represent each dataset.
First, split each dataset into chunks of consecutive lines so that we can sample whole chunks at random.
# split twitter into 1,888 chunks of 1,250 lines each
twitter_chunks <- list()
for (i in seq(1, 1888)) {
    twitter_chunks[[i]] <- twitter[(i*1250 - 1249):(i*1250)]
}
# split blogs into 944 chunks of 952 lines each
blogs_chunks <- list()
for (i in seq(1, 944)) {
    blogs_chunks[[i]] <- blogs[(i*952 - 951):(i*952)]
}
# split news into 944 chunks of 81 lines each
news_chunks <- list()
for (i in seq(1, 944)) {
    news_chunks[[i]] <- news[(i*81 - 80):(i*81)]
}
Let's select 30 chunks randomly from the data.
set.seed(100)
# downsample twitter to 944 chunks so that each source contributes the same
# number of chunks to the pool, then draw 30 chunks at random
samples <- sample(c(sample(twitter_chunks, 944), blogs_chunks, news_chunks), 30)
data <- unlist(samples)
Now we need to clean the data to make it ready for analysis using the tm package.
corpus <- Corpus(VectorSource(data))
corpus <- tm_map(corpus, PlainTextDocument)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords('english'))
# drop extremely long "words" (25 or more non-space characters), which are
# mostly URLs or strings of text run together
corpus <- tm_map(corpus, content_transformer(function(x) {
    gsub('\\S{25,}', '', x)
}))
# read a list of profane terms and remove them from the corpus
profaneDoc <- file('Terms-to-Block1.csv','r')
profane <- readLines(profaneDoc, n = 723)
close(profaneDoc)
corpus <- tm_map(corpus, removeWords, profane)
corpus <- tm_map(corpus, content_transformer(function(x) iconv(x, from = 'latin1', to = 'ASCII', sub = '')))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
# Stemming helps the model treat different forms of a word as a single token,
# but stemmed words look odd in a word cloud, so we postpone stemming until we
# build the predictive function.
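# Illustration only (hypothetical words, not part of the cleaning pipeline):
# the tm/SnowballC stemmer truncates words to their stems, e.g.
# stemDocument(c('running', 'easily', 'connection')) gives roughly
# "run", "easili", "connect" -- fine for counting, ugly in a word cloud.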
corpus <- tm_map(corpus, stripWhitespace)
Let's look at the word distribution with a word cloud.
wordcloud(corpus, max.words = 100, scale = c(4, .25), rot.per = 0.35, use.r.layout = FALSE, colors = brewer.pal(8, 'Dark2'))
"just", "like", "one" and "will" are the four biggest words in our data.
We'll create n-gram tokenizer functions for splitting the text into terms of one to five words.
# build one tokenizer per n-gram size; a function factory ensures each
# tokenizer captures its own value of n instead of sharing a loop variable
ngramTokenizer <- lapply(1:5, function(n) {
    function(x) NGramTokenizer(x, Weka_control(min = n, max = n))
})
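As a quick sanity check (the sentence below is made up for illustration, not taken from our corpus), the bigram tokenizer should split a string into overlapping two-word terms such as "happy new" and "new year".
ngramTokenizer[[2]]('happy new year to all')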
We'll create a term-document matrix for each n-gram model.
tdm <- list()
# apply the stemming we postponed earlier
stemmed_corpus <- tm_map(corpus, stemDocument)
for(i in 1:5){
tdm[[i]] <- TermDocumentMatrix(stemmed_corpus, control = list(tokenize = ngramTokenizer[[i]]))
}
Let's create a graph for the unigrams, bigrams and trigrams.
freq_tdm <- list()
freq_top <- list()
for (i in 1:3) {
    # drop very sparse terms, then keep the 10 most frequent n-grams
    freq_tdm[[i]] <- removeSparseTerms(tdm[[i]], .9999)
    freq_top[[i]] <- head(sort(row_sums(freq_tdm[[i]]), decreasing = TRUE), 10)
    plot_data <- data.frame(term = names(freq_top[[i]]), frequency = freq_top[[i]])
    print(ggplot(plot_data, aes(x = term, y = frequency, fill = frequency)) +
        labs(y = "Frequency", title = paste0("Most common terms of the ", i, "-gram model")) +
        geom_bar(stat = 'identity') +
        coord_flip())
}
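Finally, as a preview of how these counts could feed the predictive function mentioned at the start, here is a minimal sketch (not the final model): it looks up the most frequent bigram beginning with a given word. The helper name predict_next and the simple lookup strategy are illustrative assumptions, and because the corpus is stemmed the input word must be stemmed too.
# count every bigram across the corpus (row_sums comes from the slam package)
bigram_counts <- sort(row_sums(tdm[[2]]), decreasing = TRUE)
# illustrative helper: return the second word of the most frequent bigram
# starting with `last_word`, or NA when no bigram matches
predict_next <- function(last_word) {
    hits <- grep(paste0('^', last_word, ' '), names(bigram_counts), value = TRUE)
    if (length(hits) == 0) return(NA_character_)
    strsplit(hits[1], ' ')[[1]][2]
}
predict_next(stemDocument('thanks'))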