# Packages used throughout this report
library(xtable)    # HTML tables
library(dplyr)     # arrange, mutate, pull, %>%
library(purrr)     # imap, map_* helpers
library(ggplot2)   # plotting
library(RWeka)     # NGramTokenizer, Weka_control
library(tm)        # stemDocument
library(stringr)   # str_count

# Read in the complete English corpora
complete_twitter <- readLines("./Coursera-SwiftKey/final/en_US/en_US.twitter.txt")
complete_blogs <- readLines("./Coursera-SwiftKey/final/en_US/en_US.blogs.txt")
complete_news <- readLines("./Coursera-SwiftKey/final/en_US/en_US.news.txt")
# Basic statistics for a corpus: line count, word count, unique words, most frequent tokens
stats <- function(text_data) {
  no_of_lines <- length(text_data)
  # Split on runs of punctuation and/or whitespace to get rough word tokens
  tokens <- unlist(strsplit(text_data, "([[:punct:]]|\\s)+"))
  no_of_words <- length(tokens)
  no_of_unique_words <- length(unique(tokens))
  token_count <- sort(table(tokens), decreasing = TRUE)
  most_freq_tokens <- paste0(names(token_count[1:5]), collapse = ", ")
  data.frame(
    `No of Lines` = no_of_lines,
    `No of Words` = no_of_words,
    `No of Unique Words` = no_of_unique_words,
    `Most Freq Tokens` = most_freq_tokens
  )
}
stats_all <- rbind(
  stats(complete_news),
  stats(complete_blogs),
  stats(complete_twitter)
)
rownames(stats_all) <- c('news', 'blogs', 'twitter')
print(xtable(stats_all), type = "html")
|         | No.of.Lines | No.of.Words | No.of.Unique.Words | Most.Freq.Tokens |
|---------|-------------|-------------|--------------------|---------------------|
| news    | 77259       | 2752773     | 94920              | the, to, and, a, of |
| blogs   | 899288      | 38126070    | 442693             | the, to, and, of, a |
| twitter | 2360148     | 31193245    | 460207             | the, I, to, a, you  |
Read in the sampled text files (~1% of the original files):
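The script that produced these samples is not included in this report; the sketch below shows one way such a 1% sample could have been generated, assuming simple Bernoulli sampling of lines (the helper name and seed are illustrative, the paths are the ones used below).

# Hypothetical sampling step: keep each line with probability 0.01
set.seed(1234)  # arbitrary seed, for reproducibility only
sample_lines <- function(lines, prob = 0.01) {
  lines[as.logical(rbinom(length(lines), size = 1, prob = prob))]
}
writeLines(sample_lines(complete_twitter), "./Coursera-SwiftKey/sample/en_US.sample_twitter.txt")
writeLines(sample_lines(complete_blogs), "./Coursera-SwiftKey/sample/en_US.sample_blogs.txt")
writeLines(sample_lines(complete_news), "./Coursera-SwiftKey/sample/en_US.sample_news.txt")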
sample_twitter <- readLines("./Coursera-SwiftKey/sample/en_US.sample_twitter.txt")
sample_blogs <- readLines("./Coursera-SwiftKey/sample/en_US.sample_blogs.txt")
sample_news <- readLines("./Coursera-SwiftKey/sample/en_US.sample_news.txt")
Word frequencies follow a Zipf distribution: a handful of words account for most occurrences, while the vast majority of words are rare. As the plots for all three document types show, the histograms of word frequencies are approximately linear on log-log axes (a rough rank-frequency fit is sketched after the plots).
frequency_dist <- function(sample, n = 1) {
  sample %>%
    NGramTokenizer(control = Weka_control(min = n, max = n)) %>%
    tolower %>%
    table %>%
    as.data.frame.table
}
ngram_hist <- function(nGramFreq, title) {
  nGramFreq %>%
    ggplot(aes(x = Freq)) +
    geom_histogram(bins = 200, color = '#586f0a', fill = 'orange') +
    scale_x_log10() + scale_y_log10() +
    annotation_logticks() +
    theme_bw() + ggtitle(title)
}
samples <- list(
  twitter = sample_twitter,
  blogs = sample_blogs,
  news = sample_news
)
ngrams <- list(word = 1, biGram = 2, triGram = 3)

plots <- imap(samples, function(data, name) {
  imap(ngrams, function(n, ngram) {
    frequency_dist(sample = data, n = n) %>%
      ngram_hist(title = sprintf('%s %s Frequency Distribution', name, ngram))
  })
}) %>% unlist(recursive = FALSE)

# multiplot() is assumed to be defined elsewhere (e.g. the Cookbook for R helper)
do.call(multiplot, c(cols = 3, plots))
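As a complement to the histograms above, here is a minimal sketch (not part of the original analysis) that checks the classic rank-frequency form of Zipf's law on the Twitter unigrams: on a log-log scale, frequency versus rank should be roughly linear with a slope near -1. It reuses frequency_dist() and dplyr from above.

# Hypothetical Zipf check: regress log(frequency) on log(rank) for Twitter unigrams
zipf_check <- frequency_dist(sample_twitter, n = 1) %>%
  arrange(desc(Freq)) %>%
  mutate(rank = row_number())
fit <- lm(log(Freq) ~ log(rank), data = zipf_check)
coef(fit)   # a slope close to -1 is consistent with Zipf's law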
# Cumulative coverage: fraction of word instances covered by the k most frequent unigrams
cum_prob <- function(sample) {
  sample %>%
    frequency_dist %>%
    arrange(desc(Freq)) %>%
    mutate(cumProb = cumsum(Freq) / sum(Freq))
}
qs <- list(0.5, 0.9)
samples <- list(
  twitter = sample_twitter,
  news = sample_news,
  blogs = sample_blogs
)
tb <- samples %>%
  map_dfc(function(sample) {       # one column per corpus
    map_int(qs, function(q) {
      sample %>%
        cum_prob %>%
        pull(cumProb) %>%
        {which.max(. > q)}         # first index where coverage exceeds q
    })
  }) %>%
  `row.names<-`(c('50%', '90%'))
print(xtable(tb), type = 'html')
|     | twitter | news | blogs |
|-----|---------|------|-------|
| 50% | 115     | 177  | 111   |
| 90% | 5235    | 4105 | 6543  |
In the Twitter sample, 115 unique words are needed to cover 50% of all word instances and 5235 to cover 90%. For news the figures are 177 and 4105; for blogs, 111 and 6543.
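To visualise this coverage, here is a sketch (not part of the tables above) of the cumulative coverage curve for the Twitter sample, reusing cum_prob() and ggplot2 from earlier.

# Hypothetical coverage curve for the Twitter sample
coverage <- cum_prob(sample_twitter) %>%
  mutate(rank = row_number())
ggplot(coverage, aes(x = rank, y = cumProb)) +
  geom_line() +
  scale_x_log10() +
  geom_hline(yintercept = c(0.5, 0.9), linetype = 'dashed') +
  labs(x = 'Number of unique words (log scale)', y = 'Coverage of word instances') +
  theme_bw()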
# Same coverage computation, but on the stemmed text
stem_cum_prob <- function(sample) {
  sample %>%
    stemDocument %>%
    frequency_dist %>%
    arrange(desc(Freq)) %>%
    mutate(cumProb = cumsum(Freq) / sum(Freq))
}
qs <- list(0.5, 0.9)
samples <- list(
  twitter = sample_twitter,
  news = sample_news,
  blogs = sample_blogs
)
tb <- samples %>%
  map_dfc(function(sample) {       # one column per corpus
    map_int(qs, function(q) {
      sample %>%
        stem_cum_prob %>%
        pull(cumProb) %>%
        {which.max(. > q)}         # first index where coverage exceeds q
    })
  }) %>%
  `row.names<-`(c('50%', '90%'))
print(xtable(tb), type = 'html')
|     | twitter | news | blogs |
|-----|---------|------|-------|
| 50% | 107     | 166  | 104   |
| 90% | 4352    | 3432 | 4762  |
Stemming the documents reduces the number of unique words needed for the same coverage: in the stemmed Twitter sample, 107 unique words cover 50% of word instances and 4352 cover 90%. For news the figures are 166 and 3432; for blogs, 104 and 4762.
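As a quick illustration of why stemming shrinks the vocabulary (this example is not taken from the report; stemDocument is the tm helper already used above), several inflected forms collapse onto a single stem:

# Inflected forms map to one stem, so fewer unique tokens are needed for the same coverage
stemDocument(c("walk", "walks", "walked", "walking"))
# [1] "walk" "walk" "walk" "walk"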
Number of smiley faces in the Twitter sample: sum(str_count(sample_twitter, ':-\\)|:\\)')) = 1071. Number of sad faces: sum(str_count(sample_twitter, ':-\\(|:\\(')) = 173.
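The same counts can be computed in one pass; this is a small sketch assuming stringr is loaded, using the two patterns quoted above.

# Hypothetical one-pass emoticon count over the Twitter sample
emoticons <- c(smiley = ':-\\)|:\\)', sad = ':-\\(|:\\(')
sapply(emoticons, function(p) sum(str_count(sample_twitter, p)))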