Load the necessary libraries; note that not all of them will be used in this analysis.

require(ggplot2)
require(data.table)
require(stringr)
require(magrittr)
require(quanteda)
require(cowplot)

Executive Summary

The objective of this exercise is to perform an exploratory analysis of the text sources that will be used to develop the text prediction algorithm. The algorithm will predict the next word as the user is typing and hence enhance typing speed. The text data are collected from tweets, blogs and news articles. Through this exercise, the distribution of words and the relationships between the sources will be studied.

Data loading and cleaning

Our analysis covers 3 text files from different sources: tweets, blogs and news. The text files were downloaded from the link provided by Coursera and saved as RDS files for rapid loading.
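
For reference, a minimal sketch of the one-off conversion from the raw text files to RDS is shown below. The raw file names and paths are assumptions and should be adjusted to the actual download location.

# one-off conversion of the raw text files into RDS (file names assumed)
raw_files = c(tweets = 'en_US.twitter.txt', blogs = 'en_US.blogs.txt', news = 'en_US.news.txt')
for (src in names(raw_files)) {
  # skipNul = TRUE suppresses warnings caused by embedded nul characters in the raw files
  v_text = readLines(raw_files[[src]], encoding = 'UTF-8', skipNul = TRUE)
  saveRDS(v_text, file = paste0(src, '.rds'))
}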

# /Users/Daniel/COURSERA DATA SCIENCE/capstone project
tweets = readRDS('/Users/Daniel/COURSERA DATA SCIENCE/capstone project/tweets.rds')
blogs = readRDS('/Users/Daniel/COURSERA DATA SCIENCE/capstone project/blogs.rds')
news = readRDS('/Users/Daniel/COURSERA DATA SCIENCE/capstone project/news.rds')

Basic count analysis

In the basic count analysis, we use simple count statistics to explore the text files and visualise the results.

Line and word count

Compute the line and word count in each corpus (text file).

text_list = list(tweets, blogs, news)

# line count helper function
f_linecount = function(v_text){
  return(length(v_text))
}

# word count helper function (counts runs of word characters, i.e. words)
f_wordcount = function(v_text){
  return(str_count(v_text, '\\w+') %>% sum)
}

df_basic = data.frame(source = c('tweets','blogs','news'), line_count = NA, word_count = NA)

df_basic$line_count = sapply(text_list, f_linecount)
df_basic$word_count = sapply(text_list, f_wordcount)

The plots below show the line and word counts for each source.

# melt the data table so that we can facet the bar plot by line count and word count
df_plot_basic = melt(as.data.table(df_basic), measure.vars = c('line_count','word_count'), value.name = 'count', variable.name = 'by')

(ggplot(data = df_plot_basic) + geom_bar(aes(x = source, y = count/1E+06), stat = 'identity') + facet_wrap(~by, scales = 'free') +
    ylab('Count (in millions)'))

Word frequency analysis

In this section we explore the top N most frequent words within each corpus. Some data cleaning and transformation is performed before the top N analysis takes place.

Data cleaning and transformation

As the word analysis is quite computationally intensive given the scale of the data sources, we perform simple random sampling (20% of each corpus) and run the analysis on the samples instead.

# helper function to sample indices from a corpus, @v_data = character; @percentage = float
f_sampling = function(v_data, percentage = .2){
  return(sample(seq_along(v_data), size = round(length(v_data)*percentage)))
}

blogs_sample = blogs[f_sampling(blogs)]

news_sample = news[f_sampling(news)]

tweets_sample = tweets[f_sampling(tweets)]
# collapse the word vector before creating the corpus object
blogs_sample = paste(blogs_sample, collapse = ' ')
news_sample = paste(news_sample, collapse = ' ')
tweets_sample = paste(tweets_sample, collapse = ' ')

rm(list=c('blogs','tweets','news'))

The transformation rules for the text sources are as follows:

  • all letters will be converted to lower case
  • all punctuation will be removed
  • alias handles and hashtags will be removed (tweets only)
  • URLs will be removed
  • numbers will be removed
  • stopwords such as ‘is’, ‘a’, ‘the’ and ‘of’ will be removed

Lower-casing, URL removal and number removal are applied while building the corpus; punctuation, symbols and stopwords are removed later when constructing the document-feature matrix (dfm).

f_removeurl = function(x){
  gsub(pattern = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',replacement = '', x, perl = T)
}

f_removehashtag = function(x){
  gsub(pattern = '(#[a-zA-Z0-9]+)+', replacement = '', x)
}

f_removealias = function(x){
  gsub(pattern = '@([a-zA-Z0-9]+)+', replacement = '', x, perl = T)
}

f_removenumber = function(x){
  gsub(pattern = '[0-9]+', replacement = '', x)
}

# wrapper function to transform the text in one go
f_transformation = function(source, source_name){
  mycorpus = corpus(source,docnames = source_name)
  
  texts(mycorpus) = tolower(texts(mycorpus))
  texts(mycorpus) = f_removeurl(texts(mycorpus))
  texts(mycorpus) = f_removenumber(texts(mycorpus))
  return(mycorpus)
} 

# create term document data table from dfm, @mydfm = dfm, @source_name = character
f_dfm_to_dt = function(mydfm, source_name){
  mydt = t(as.matrix(mydfm)) %>% data.table(keep.rownames = T)
  setnames(mydt, colnames(mydt), c('term', 'frequency'))
  mydt[, doc:=source_name]
  setorder(mydt, -frequency)
  return(mydt)
}

blogs_corpus = f_transformation(blogs_sample, 'blogs')
blogs_dfm = dfm(x = blogs_corpus, remove = stopwords('english'), remove_symbols = TRUE, remove_punct = TRUE)
blogs_dt = f_dfm_to_dt(blogs_dfm, 'blogs')

news_corpus = f_transformation(news_sample, 'news')
news_dfm = dfm(x = news_corpus, remove = stopwords('english'), remove_symbols = TRUE, remove_punct = TRUE)
news_dt = f_dfm_to_dt(news_dfm, 'news')

tweets_corpus = f_transformation(tweets_sample, 'tweets')
texts(tweets_corpus) = f_removehashtag(texts(tweets_corpus))
texts(tweets_corpus) = f_removealias(texts(tweets_corpus))
tweets_dfm = dfm(x = tweets_corpus, remove = stopwords('english'), remove_symbols = TRUE, remove_punct = TRUE)
tweets_dt = f_dfm_to_dt(tweets_dfm, 'tweets')

Visualize the results

topN = 20

# identify the terms that are common between the blogs, news and tweets sample corpus
intersect_term = Reduce(intersect, list(blogs_dt[1:topN, term], news_dt[1:topN, term], tweets_dt[1:topN, term]))

# assign a flag to identify whether a term appears in all 3 corpora
blogs_dt[1:topN, .(term,frequency, common = ifelse(term %in% intersect_term, 'Y','N'))][]
##       term frequency common
##  1:    one     24984      Y
##  2:   will     22250      Y
##  3:   just     20197      Y
##  4:   like     19783      Y
##  5:    can     19658      Y
##  6:   time     17973      Y
##  7:    get     14255      Y
##  8:    now     12109      Y
##  9: people     11990      N
## 10:   know     11956      N
## 11:   also     11020      N
## 12:    new     10836      Y
## 13:   even     10460      N
## 14: really     10220      N
## 15:  first     10208      N
## 16:    day     10172      N
## 17:   make     10163      N
## 18:     us     10086      N
## 19:   back      9942      N
## 20:    see      9894      N
blogs_plot = (
  ggplot(data = blogs_dt[1:topN,.(term,frequency, common = ifelse(term %in% intersect_term, 'Y','N'))]) 
  + geom_bar(aes(x = factor(term, levels = term[order(frequency)]), y = frequency, fill = common), 
    stat = 'identity') + coord_flip() + theme_bw() + labs(title='Blogs word frequency plot')
    + xlab('Term') + theme(plot.title = element_text(hjust = .5, vjust = .5), panel.grid.major.y = element_blank())
)

news_plot = (
  ggplot(data = news_dt[1:topN,.(term,frequency, common = ifelse(term %in% intersect_term, 'Y','N'))]) 
  + geom_bar(aes(x = factor(term, levels = term[order(frequency)]), y = frequency, fill = common), 
    stat = 'identity') + coord_flip() + theme_bw() + labs(title='News word frequency plot')
    + xlab('Term') + theme(plot.title = element_text(hjust = .5, vjust = .5), panel.grid.major.y = element_blank())
)

tweets_plot = (
  ggplot(data = tweets_dt[1:topN,.(term,frequency, common = ifelse(term %in% intersect_term, 'Y','N'))]) 
  + geom_bar(aes(x = factor(term, levels = term[order(frequency)]), y = frequency, fill = common), 
    stat = 'identity') + coord_flip() + theme_bw() + labs(title='Tweets word frequency plot')
    + xlab('Term') + theme(plot.title = element_text(hjust = .5, vjust = .5), panel.grid.major.y = element_blank())
)

plot_grid(blogs_plot, news_plot, tweets_plot, nrow = 1)

From the word frequency plots above, we see that the top 20 words in each text source are very common words. Almost 50% of the top 20 words in each corpus appear in the other corpora as well. We also notice that the word distribution in the blogs corpus is quite skewed, with frequencies concentrated in the top 10 words. This suggests we may be able to exploit this pattern to reduce the amount of training data needed.

Prediction Algorithm Strategy

Since the goal is to predict the next word that the user is going to type, it is natural to use the most recently typed words as the predictors. Thinking along these lines, we need to decide how many typed words are used in the prediction. This requires n-gram tokenization, in which we tokenize n adjacent terms and rank the n-grams by frequency.

At a high level, the flow of the prediction process is as follows:

  1. Tokenize the text using n-gram tokenization (n = 2).
  2. Split each n-gram into a pair by separating the last word from the rest. The bi-gram “be doing” is split into “be” (a uni-gram) and “doing”; the tri-gram “will be doing” is split into “will be” (a bi-gram) and “doing”. Each split produces a paired (n-1)-gram and last word.
  3. Compute the frequency of each pair and store the pairs in a dictionary with the (n-1)-gram as the key and the last word as the value.
  4. Repeat steps 1 to 3 for n = 3 and n = 4.
  5. At prediction time, the most recently typed words are used to form an (n-1)-gram, an (n-2)-gram and an (n-3)-gram. These n-grams are then matched against the dictionary and the top 3 last words (ranked by frequency) are selected as candidates for the next word.

Note that stopwords should not be removed in the algorithm design, as the prediction is meant to enhance the user experience in natural language typing.

Below is an example of bi-gram tokenization on the blogs corpus, followed by a sketch of the prefix/last-word split and dictionary lookup described in the steps above.

blogs_bigram = tokens_ngrams(x = tokens(blogs_corpus, remove_punct = TRUE, remove_symbols = TRUE), n = 2, concatenator = ' ')

blogs_bigram_dt = data.table(ngram = blogs_bigram$blogs, freq = 1)

blogs_bigram_dt[, .(freq = sum(freq)), ngram] %>% setorder(-freq) %>% head()
##      ngram  freq
## 1:  of the 37167
## 2:  in the 30851
## 3:  to the 16877
## 4:  on the 14878
## 5:   to be 13818
## 6: and the 11793
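
As a rough sketch of steps 2, 3 and 5, the aggregated bi-gram table can be split into a one-word prefix (the key) and a last word (the value), and the resulting dictionary queried for the top 3 candidates. The helper f_predict_next below is hypothetical and for illustration only; the final algorithm will also use tri-grams and four-grams.

# aggregate the bi-gram counts, then split each bi-gram into a prefix (key) and a last word (value)
blogs_bigram_freq = blogs_bigram_dt[, .(freq = sum(freq)), ngram]
blogs_bigram_freq[, c('prefix', 'last_word') := tstrsplit(ngram, ' ', keep = 1:2)]
setorder(blogs_bigram_freq, prefix, -freq)

# hypothetical lookup helper: return the top_n candidate next words for a typed word
f_predict_next = function(dict_dt, typed_word, top_n = 3){
  dict_dt[prefix == typed_word, head(last_word, top_n)]
}

f_predict_next(blogs_bigram_freq, 'to')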