Load the necessary libraries; note that not all of them will be used in this analysis.
require(ggplot2)
require(data.table)
require(stringr)
require(magrittr)
require(quanteda)
require(cowplot)
The objective of this exercise is to perform an exploratory analysis on the text sources that will be used to develop the text typing prediction algorithm. The algorithm will predict the next word as the user types and hence improve typing speed. The text data are collected from tweets, blogs and news articles. Through this exercise, the distribution and relationships of the text will be studied.
Our analysis covers 3 text files coming from different sources: tweets, blogs and news. The text files were downloaded from the link provided by Coursera and saved as RDS files for rapid loading.
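For reference, the conversion from the raw text files to RDS could be done with a sketch like the one below; the raw file names and encoding options are assumptions, as the conversion step itself is not shown in this report.
# read the raw text files line by line and cache them as RDS for faster reloads
# (file names below are assumptions based on the Coursera download)
raw_tweets = readLines('en_US.twitter.txt', encoding = 'UTF-8', skipNul = TRUE)
raw_blogs = readLines('en_US.blogs.txt', encoding = 'UTF-8', skipNul = TRUE)
raw_news = readLines('en_US.news.txt', encoding = 'UTF-8', skipNul = TRUE)
saveRDS(raw_tweets, 'tweets.rds')
saveRDS(raw_blogs, 'blogs.rds')
saveRDS(raw_news, 'news.rds')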
# /Users/Daniel/COURSERA DATA SCIENCE/capstone project
tweets = readRDS('/Users/Daniel/COURSERA DATA SCIENCE/capstone project/tweets.rds')
blogs = readRDS('/Users/Daniel/COURSERA DATA SCIENCE/capstone project/blogs.rds')
news = readRDS('/Users/Daniel/COURSERA DATA SCIENCE/capstone project/news.rds')
In the basic count analysis, we will use simple count statistics to explore the text files and visualise the results.
Compute the line and word count in each corpus (text file).
text_list = list(tweets, blogs, news)
# line count helper function
f_linecount = function(v_text){
return(length(v_text))
}
# word count helper function: count runs of word characters (i.e. whole words) rather than single characters
f_wordcount = function(v_text){
return(str_count(v_text, '\\w+') %>% sum)
}
df_basic = data.frame(source = c('tweets','blogs','news'), line_count = NA, word_count = NA)
df_basic$line_count = sapply(text_list, f_linecount)
df_basic$word_count = sapply(text_list, f_wordcount)
The plots below show the line and word count statistics for each source.
# convert to data.table and melt so that we can facet the bar plot by line count and word count
df_plot_basic = melt(as.data.table(df_basic), measure.vars = c('line_count','word_count'), value.name = 'count', variable.name = 'by')
(ggplot(data = df_plot_basic) + geom_bar(aes(x = source, y = count/1E+06), stat = 'identity') + facet_wrap(~by, scales = 'free') +
ylab('Count (in millions)'))
In this section we will explore the top N most frequent words within each corpus. Some data cleaning and transformation is required before the top N analysis takes place.
As the word analysis is quite computationally intensive given the scale of the data source, we will draw a simple random sample (20% of each corpus) and run the analysis on the sample instead.
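Because the sampling is random, fixing the random seed before drawing the samples makes the analysis reproducible; the seed value below is an arbitrary choice, not one used in the original run.
set.seed(1234)  # arbitrary seed so the 20% samples can be reproduced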
# helper function to draw a sample of element indices from a corpus, @v_data = character; @percentage = float
f_sampling = function(v_data, percentage = .2){
return(sample(seq_along(v_data), size = round(length(v_data)*percentage)))
}
blogs_sample = blogs[f_sampling(blogs)]
news_sample = news[f_sampling(news)]
tweets_sample = tweets[f_sampling(tweets)]
# collapse the word vector before creating the corpus object
blogs_sample = paste(blogs_sample, collapse = ' ')
news_sample = paste(news_sample, collapse = ' ')
tweets_sample = paste(tweets_sample, collapse = ' ')
rm(list=c('blogs','tweets','news'))
The transformation rules applied to the text sources are as follows:
# remove URLs (http/https links)
f_removeurl = function(x){
gsub(pattern = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',replacement = '', x, perl = T)
}
# remove Twitter hashtags, e.g. #rstats
f_removehashtag = function(x){
gsub(pattern = '(#[a-zA-Z0-9]+)+', replacement = '', x)
}
# remove Twitter handles, e.g. @user
f_removealias = function(x){
gsub(pattern = '@([a-zA-Z0-9]+)+', replacement = '', x, perl = T)
}
# remove digits
f_removenumber = function(x){
gsub(pattern = '[0-9]+', replacement = '', x)
}
# wrapper function to transform the text in one go
f_transformation = function(source, source_name){
mycorpus = corpus(source,docnames = source_name)
texts(mycorpus) = tolower(texts(mycorpus))
texts(mycorpus) = f_removeurl(texts(mycorpus))
texts(mycorpus) = f_removenumber(texts(mycorpus))
return(mycorpus)
}
# create term document data table from dfm, @mydfm = dfm, @source_name = character
f_dfm_to_dt = function(mydfm, source_name){
mydt = t(as.matrix(mydfm)) %>% data.table(keep.rownames = T)
setnames(mydt, colnames(mydt), c('term', 'frequency'))
mydt[, doc:=source_name]
setorder(mydt, -frequency)
return(mydt)
}
blogs_corpus = f_transformation(blogs_sample, 'blogs')
blogs_dfm = dfm(x = blogs_corpus, remove = stopwords('english'), remove_symbols = TRUE, remove_punct = TRUE)
blogs_dt = f_dfm_to_dt(blogs_dfm, 'blogs')
news_corpus = f_transformation(news_sample, 'news')
news_dfm = dfm(x = news_corpus, remove = stopwords('english'), remove_symbols = TRUE, remove_punct = TRUE)
news_dt = f_dfm_to_dt(news_dfm, 'news')
tweets_corpus = f_transformation(tweets_sample, 'tweets')
texts(tweets_corpus) = f_removehashtag(texts(tweets_corpus))
texts(tweets_corpus) = f_removealias(texts(tweets_corpus))
tweets_dfm = dfm(x = tweets_corpus, remove = stopwords('english'), remove_symbols = TRUE, remove_punct = TRUE)
tweets_dt = f_dfm_to_dt(tweets_dfm, 'tweets')
topN = 20
# identify the terms that are common across the blogs, news and tweets sample corpora
intersect_term = Reduce(intersect, list(blogs_dt[1:topN, term], news_dt[1:topN, term], tweets_dt[1:topN, term]))
# assign a flag to identify whether a term appears in all 3 corpora
blogs_dt[1:topN, .(term,frequency, common = ifelse(term %in% intersect_term, 'Y','N'))][]
## term frequency common
## 1: one 24984 Y
## 2: will 22250 Y
## 3: just 20197 Y
## 4: like 19783 Y
## 5: can 19658 Y
## 6: time 17973 Y
## 7: get 14255 Y
## 8: now 12109 Y
## 9: people 11990 N
## 10: know 11956 N
## 11: also 11020 N
## 12: new 10836 Y
## 13: even 10460 N
## 14: really 10220 N
## 15: first 10208 N
## 16: day 10172 N
## 17: make 10163 N
## 18: us 10086 N
## 19: back 9942 N
## 20: see 9894 N
blogs_plot = (
ggplot(data = blogs_dt[1:topN,.(term,frequency, common = ifelse(term %in% intersect_term, 'Y','N'))])
+ geom_bar(aes(x = factor(term, levels = term[order(frequency)]), y = frequency, fill = common),
stat = 'identity') + coord_flip() + theme_bw() + labs(title='Blogs word frequency plot')
+ xlab('Term') + theme(plot.title = element_text(hjust = .5, vjust = .5), panel.grid.major.y = element_blank())
)
news_plot = (
ggplot(data = news_dt[1:topN,.(term,frequency, common = ifelse(term %in% intersect_term, 'Y','N'))])
+ geom_bar(aes(x = factor(term, levels = term[order(frequency)]), y = frequency, fill = common),
stat = 'identity') + coord_flip() + theme_bw() + labs(title='News word frequency plot')
+ xlab('Term') + theme(plot.title = element_text(hjust = .5, vjust = .5), panel.grid.major.y = element_blank())
)
tweets_plot = (
ggplot(data = tweets_dt[1:topN,.(term,frequency, common = ifelse(term %in% intersect_term, 'Y','N'))])
+ geom_bar(aes(x = factor(term, levels = term[order(frequency)]), y = frequency, fill = common),
stat = 'identity') + coord_flip() + theme_bw() + labs(title='Tweets word frequency plot')
+ xlab('Term') + theme(plot.title = element_text(hjust = .5, vjust = .5), panel.grid.major.y = element_blank())
)
plot_grid(blogs_plot, news_plot, tweets_plot, nrow = 1)
From the word frequency plots above, we see that the top 20 words in each text source are very common, everyday words. Almost 50% of the top 20 words in each corpus also appear in the other corpora. We also notice that the word distribution in the blogs corpus is noticeably skewed, with most of the frequency concentrated in the top 10 words. This might suggest that we can leverage this pattern to reduce the amount of training data we need.
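As a rough check on this skew, a sketch like the one below (reusing the blogs_dt table built above, which is already sorted by descending frequency) estimates how many distinct terms are needed to cover a given share of all word occurrences; the 50% and 90% targets are arbitrary choices for illustration.
# cumulative share of word occurrences covered by the most frequent terms
blogs_coverage = copy(blogs_dt)[, coverage := cumsum(frequency) / sum(frequency)]
# number of distinct terms needed to reach each coverage target
sapply(c(.5, .9), function(p) which(blogs_coverage$coverage >= p)[1])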
Since the goal is to predict the next word the user is going to type, it is natural to use what the user has typed most recently to predict the next word. Along these lines, we need to decide how many of the previously typed words to use in the prediction. This calls for n-gram tokenization, in which we tokenize n adjacent terms and rank them by frequency.
At a high level, the prediction process takes the most recent words typed by the user, looks them up in an n-gram frequency table, and suggests the most frequent continuation as the next word.
Note that stopwords should not be removed in the algorithm design, since the prediction is meant to enhance the user experience in natural-language typing.
Below is an example of bi-gram tokenization on the blogs corpus.
blogs_bigram = tokens_ngrams(x = tokens(blogs_corpus, remove_punct = TRUE, remove_symbols = TRUE), n = 2, concatenator = ' ')
blogs_bigram_dt = data.table(ngram = blogs_bigram$blogs, freq = 1)
blogs_bigram_dt[, .(freq = sum(freq)), ngram] %>% setorder(-freq) %>% head()
## ngram freq
## 1: of the 37167
## 2: in the 30851
## 3: to the 16877
## 4: on the 14878
## 5: to be 13818
## 6: and the 11793
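To illustrate the lookup step described above, here is a minimal sketch (not the final algorithm) of how the aggregated bi-gram table could be queried to suggest the next word; the names blogs_bigram_freq and f_predict_next are hypothetical and introduced only for this example.
# aggregate the bi-gram counts and split each bi-gram into its two words
blogs_bigram_freq = blogs_bigram_dt[, .(freq = sum(freq)), by = ngram]
blogs_bigram_freq[, c('w1', 'w2') := tstrsplit(ngram, ' ', fixed = TRUE)]
# given the last typed word, return the most frequent following words
f_predict_next = function(last_word, top = 3){
blogs_bigram_freq[w1 == tolower(last_word)][order(-freq)][1:top, w2]
}
f_predict_next('of')  # 'the' should appear among the top suggestions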