Introduction

This capstone project uses tm, RWeka and other R packages to analyze text data from Twitter, blogs and news articles, with the goal of building an app that predicts the next word a user will type.

Perform Data Cleaning

Load necessary packages and data

First, let’s load all the packages we will use in this project.

library(tm)
library(wordcloud)
library(RWeka)
library(dplyr)
library(slam)
library(qdap)
library(stringi)
library(ggplot2)

Load the datasets

twitter_con <- file('en_US.twitter.txt', 'r')
news_con <- file('en_US.news.txt', 'r')
blogs_con <- file('en_US.blogs.txt', 'r')

twitter <- readLines(twitter_con, skipNul = TRUE)
close(twitter_con)

news <- readLines(news_con, skipNul = TRUE, n = 77258)
close(news_con)

blogs <- readLines(blogs_con, skipNul = TRUE)
close(blogs_con)

Analyze 3 data files

Let’s summarize each of the three datasets.

words_twitter <- stri_count_words(twitter)
words_blogs <- stri_count_words(blogs)
words_news <- stri_count_words(news)

summary_table <- data.frame(data = c('blogs','news','twitter'),
                            lines = c(length(blogs),length(news),length(twitter)),
                            words = c(sum(words_blogs),sum(words_news),sum(words_twitter)))
summary_table
##      data   lines    words
## 1   blogs  899288 38916603
## 2    news   77258  2711940
## 3 twitter 2360148 30249959

Based on these numbers, analyzing the full dataset is impractical with the limited resources of a local machine. Instead, we can randomly select samples to represent each dataset.
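
As a rough check of the memory footprint that motivates sampling, we can look at the size of each object in memory (a quick illustrative sketch; the exact figures depend on your machine and locale):

format(object.size(twitter), units = 'MB')
format(object.size(blogs), units = 'MB')
format(object.size(news), units = 'MB')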

Split data into chunks

Split each dataset into fixed-size chunks so we can sample from them.

# split twitter into 1888 chunks of 1250 lines each
twitter_chunks <- list()
for (i in seq(1, 1888)) {
  twitter_chunks[[i]] <- twitter[(i*1250 - 1249):(i*1250)]
}

# split blogs into 944 chunks of 952 lines each
blogs_chunks <- list()
for (i in seq(1, 944)) {
  blogs_chunks[[i]] <- blogs[(i*952 - 951):(i*952)]
}

# split news into 944 chunks of 81 lines each
news_chunks <- list()
for (i in seq(1, 944)) {
  news_chunks[[i]] <- news[(i*81 - 80):(i*81)]
}
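
Because the chunk sizes don’t divide the line counts exactly, a few trailing lines from each file fall outside the chunks. A quick check (illustrative only) of what the chunks cover:

length(unlist(twitter_chunks))   # 2,360,000 of the 2,360,148 twitter lines
length(unlist(blogs_chunks))     # 898,688 of the 899,288 blog lines
length(unlist(news_chunks))      # 76,464 of the 77,258 news lines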

Random selection

Let’s randomly select 30 chunks from the combined data.

set.seed(100)
# downsample twitter to 944 chunks so each source contributes equally,
# then draw 30 chunks at random from the combined pool
samples <- sample(c(sample(twitter_chunks, 944), blogs_chunks, news_chunks), 30)
data <- unlist(samples)
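
A quick look (illustrative only) at how much text the sample contains:

length(data)                  # number of sampled lines
sum(stri_count_words(data))   # approximate word count of the sample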

Text transformation

Now we need to clean the data with the tm package to get it ready for analysis.

corpus <- Corpus(VectorSource(data))
corpus <- tm_map(corpus, PlainTextDocument)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords('english'))
# drop extremely long "words" (25 or more characters), which are mostly URLs and other noise
corpus <- tm_map(corpus, content_transformer(function(x) {
  gsub('\\S{25,} ', '', x)
}))

# remove profanity using an external list of terms to block
profaneDoc <- file('Terms-to-Block1.csv', 'r')
profane <- readLines(profaneDoc, n = 723)
close(profaneDoc)

corpus <- tm_map(corpus, removeWords, profane)
# strip non-ASCII characters, punctuation and numbers
corpus <- tm_map(corpus, content_transformer(function(x) iconv(x, from = 'latin1', to = 'ASCII', sub = '')))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)

# Stemming helps a model generalize over word forms, but stemmed tokens are hard to read
# in a word cloud, so we delay stemming until we build the predictive function.
corpus <- tm_map(corpus, stripWhitespace)
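
To confirm that the transformations behaved as expected, we can inspect a couple of cleaned documents (a quick check, not part of the original pipeline):

inspect(corpus[1:2])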

Text Analysis

Word distribution analysis

Let’s look at the word distribution using a word cloud.

wordcloud(corpus, max.words = 100, scale = c(4, .25), rot.per = 0.35, use.r.layout = FALSE, colors = brewer.pal(8, 'Dark2'))

just, like, one and will are the four most prominent words in our data.

Create n-gram tokenizers

We’ll create one tokenizer function per n-gram size (1 through 5) using RWeka’s NGramTokenizer.

ngramTokenizer <- list()
for (i in 1:5) {
  # local() makes each tokenizer capture its own n; otherwise all five would use the final i (5)
  ngramTokenizer[[i]] <- local({
    n <- i
    function(x) NGramTokenizer(x, Weka_control(min = n, max = n))
  })
}
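
As a quick illustration (the example phrase is arbitrary), the bigram tokenizer should split a string into overlapping word pairs:

ngramTokenizer[[2]]('thanks for the follow')
# should return the bigrams "thanks for", "for the", "the follow"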

Create Term Document Matrix

We’ll create a term-document matrix for each n-gram model, stemming the corpus first.

tdm <- list()
# stem the corpus now that the word cloud has been generated
stemmed_corpus <- tm_map(corpus, stemDocument)
for (i in 1:5) {
  tdm[[i]] <- TermDocumentMatrix(stemmed_corpus, control = list(tokenize = ngramTokenizer[[i]]))
}
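
A quick check (illustrative) of how many distinct terms each n-gram model contains:

sapply(tdm, function(m) dim(m)[1])   # number of distinct terms per model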

Visualize word frequency

Let’s plot the ten most frequent terms of the unigram, bigram and trigram models.

freq <- list()
freq_sum <- list()
for (i in 1:3) {
  freq[[i]] <- removeSparseTerms(tdm[[i]], .9999)
  freq_sum[[i]] <- head(sort(row_sums(freq[[i]]), decreasing = TRUE), 10)
  plot_data <- data.frame(term = names(freq_sum[[i]]), frequency = freq_sum[[i]])
  print(ggplot(plot_data, aes(x = term, y = frequency, fill = frequency)) +
          geom_bar(stat = 'identity') +
          coord_flip() +
          labs(y = "Frequency", title = paste0("Most common terms of the ", i, "-gram model")))
}
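
Looking ahead to the prediction app, these n-gram counts can be turned into a simple lookup table: split each trigram into a two-word prefix and the word that follows it, then keep the most frequent continuation for each prefix. Below is a minimal sketch of that idea; the object names (trigram_counts, lookup, predict_next) and the most-frequent-continuation rule are illustrative placeholders rather than the final algorithm, and since the corpus was stemmed, a real app would stem the user’s input the same way.

trigram_counts <- sort(row_sums(tdm[[3]]), decreasing = TRUE)
trigram_terms <- names(trigram_counts)

# split each trigram "w1 w2 w3" into the prefix "w1 w2" and the next word "w3"
prefix    <- sub(' \\S+$', '', trigram_terms)
next_word <- sub('^.* ', '', trigram_terms)

lookup <- data.frame(prefix, next_word, count = trigram_counts,
                     stringsAsFactors = FALSE)
# counts are sorted, so keeping the first row per prefix keeps the most frequent continuation
lookup <- lookup[!duplicated(lookup$prefix), ]

# hypothetical helper: return the stored next word for a two-word input, or NA if unseen
predict_next <- function(two_words) {
  hit <- lookup$next_word[lookup$prefix == tolower(two_words)]
  if (length(hit) == 0) NA_character_ else hit[1]
}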