Overall Stats

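The code in this section assumes the following packages are attached (a minimal set inferred from the calls below; RWeka additionally requires a working Java installation):

library(xtable)    # HTML tables
library(RWeka)     # NGramTokenizer, Weka_control
library(tm)        # stemDocument
library(dplyr)     # arrange, mutate, pull, %>%
library(purrr)     # imap, map_dfc, map_int
library(ggplot2)   # histograms
library(stringr)   # str_count
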
# Read the full en_US corpora
complete_twitter <- readLines("./Coursera-SwiftKey/final/en_US/en_US.twitter.txt")
complete_blogs <- readLines("./Coursera-SwiftKey/final/en_US/en_US.blogs.txt")
complete_news <- readLines("./Coursera-SwiftKey/final/en_US/en_US.news.txt")
stats <- function(text_data) {
    no_of_lines <- length(text_data)

    # Split each line on runs of punctuation and/or whitespace to get word tokens
    tokens <- unlist(strsplit(text_data, "([[:punct:]]|\\s)+"))
    no_of_words <- length(tokens)

    no_of_unique_words <- length(unique(tokens))

    # Five most frequent tokens, as a comma-separated string
    token_count <- sort(table(tokens), decreasing = TRUE)
    most_freq_tokens <- paste0(names(token_count[1:5]), collapse = ", ")

    data.frame(
        `No of Lines` = no_of_lines,
        `No of Words` = no_of_words,
        `No of Unique Words` = no_of_unique_words,
        `Most Freq Tokens` = most_freq_tokens
    )
}
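
For reference, the splitting rule above treats any run of punctuation and/or whitespace as a token boundary, so contractions are split into separate tokens. A quick illustrative example (the sentence is arbitrary, not from the corpus):

# Illustration of the tokenization rule used in stats()
unlist(strsplit("Don't panic -- it's fine!", "([[:punct:]]|\\s)+"))
# [1] "Don"   "t"     "panic" "it"    "s"     "fine"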

stats_all <- rbind(
    stats(complete_news),
    stats(complete_blogs),
    stats(complete_twitter)
)

rownames(stats_all) <- c('news', 'blogs', 'twitter')
print(xtable(stats_all), type = "html")
         No.of.Lines  No.of.Words  No.of.Unique.Words  Most.Freq.Tokens
news           77259      2752773               94920  the, to, and, a, of
blogs         899288     38126070              442693  the, to, and, of, a
twitter      2360148     31193245              460207  the, I, to, a, you


Read in the sampled text files (~1% of the original files)

sample_twitter <- readLines("./Coursera-SwiftKey/sample/en_US.sample_twitter.txt") 
sample_blogs <- readLines("./Coursera-SwiftKey/sample/en_US.sample_blogs.txt") 
sample_news <- readLines("./Coursera-SwiftKey/sample/en_US.sample_news.txt")
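
The sampling step itself is not shown in this report; a minimal sketch of how such ~1% samples could have been produced is given below. The rbinom-based line selection and the sample_and_write() helper are assumptions for illustration, not the author's exact procedure.

# Hypothetical sketch: keep each line with probability 0.01 and write it out
set.seed(123)
sample_and_write <- function(lines, out_path, p = 0.01) {
    keep <- as.logical(rbinom(length(lines), size = 1, prob = p))
    writeLines(lines[keep], out_path)
}
sample_and_write(complete_twitter, "./Coursera-SwiftKey/sample/en_US.sample_twitter.txt")
sample_and_write(complete_blogs, "./Coursera-SwiftKey/sample/en_US.sample_blogs.txt")
sample_and_write(complete_news, "./Coursera-SwiftKey/sample/en_US.sample_news.txt")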

Word / Bi-Gram / Tri-Gram Frequency Distributions

Word frequencies follow a Zipf distribution: a handful of words occur very frequently, while most words occur only rarely. As the plots for all three document types show, the log-log histograms of word frequencies follow an approximately linear relationship.
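
To make the Zipf claim concrete, one can also check the classical rank-frequency form of the law: regressing log frequency on log rank should give a slope of roughly -1. The sketch below is an illustrative check on the twitter unigrams, not part of the original analysis; it reuses the same tokenization rule as stats() above.

# Illustrative Zipf check: log(frequency) should fall roughly linearly in log(rank)
word_freq <- sort(table(tolower(unlist(strsplit(sample_twitter, "([[:punct:]]|\\s)+")))),
                  decreasing = TRUE)
zipf <- data.frame(rank = seq_along(word_freq), freq = as.integer(word_freq))
coef(lm(log(freq) ~ log(rank), data = zipf))   # slope is expected to be close to -1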

frequency_dist <- function(sample, n = 1) {
    # Tokenize into n-grams, lower-case them, and return the counts
    # as a data frame (frequencies in the Freq column)
    sample %>%
        NGramTokenizer(control = Weka_control(min = n, max = n)) %>%
        tolower %>%
        table %>%
        as.data.frame.table
}

ngram_hist <- function(nGramFreq, title) {
    # Log-log histogram of the n-gram frequency distribution
    nGramFreq %>%
        ggplot(aes(x = Freq)) +
        geom_histogram(bins = 200, color = '#586f0a', fill = 'orange') +
        scale_x_log10() + scale_y_log10() +
        annotation_logticks() +
        theme_bw() + ggtitle(title)
}

samples <- list(
    twitter = sample_twitter, 
    blogs = sample_blogs, 
    news = sample_news
)

ngrams <- list(word = 1, biGram = 2, triGram = 3)

# Build one frequency-distribution plot per (sample, n-gram order) pair
plots <- imap(samples, function(data, name) {
    imap(ngrams, function(n, ngram) {
        frequency_dist(
            sample = data,
            n = n
        ) %>%
            ngram_hist(
                title = sprintf('%s %s Frequency Distribution', name, ngram)
            )
    })
}) %>% unlist(recursive = FALSE)

do.call(multiplot, c(cols = 3, plots))
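
Note that multiplot() is not part of ggplot2 or base R; it is presumably the grid-layout helper from the "Cookbook for R". If that helper is not available, gridExtra can produce the same three-column layout:

# Equivalent layout using gridExtra instead of the multiplot() helper
library(gridExtra)
grid.arrange(grobs = plots, ncol = 3)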

Unique Words Coverage

cum_prob <- function(sample) {
    # Unigram frequencies sorted from most to least frequent, with the
    # cumulative share of all word instances that they cover
    sample %>%
        frequency_dist %>%
        arrange(desc(Freq)) %>%
        mutate(cumProb = cumsum(Freq) / sum(Freq))
}

qs <- list(0.5, 0.9)
samples <- list(
    twitter = sample_twitter, 
    news = sample_news, 
    blogs = sample_blogs
    )

tb <- samples %>%
    map_dfc(function(sample) {
        # Number of most-frequent unique words needed to exceed coverage q
        map_int(qs, function(q) {
            sample %>%
                cum_prob %>%
                pull(cumProb) %>%
                {which.max(. > q)}
        })
    }) %>%
    as.data.frame %>%
    `row.names<-`(c('50%', '90%'))

print(xtable(tb), type='html')
        twitter   news   blogs
50%         115    177     111
90%        5235   4105    6543


In the twitter sample, 115 unique words are needed to cover 50% of all word instances, and 5235 unique words are needed to cover 90%.

For news, the corresponding figures are 177 (50%) and 4105 (90%).

For blogs, they are 111 (50%) and 6543 (90%).
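
For a fuller picture than these two cut-offs, the entire coverage curve can be plotted. The sketch below is illustrative only and reuses cum_prob() on the twitter sample:

# Illustrative coverage curve: share of all word instances covered as more
# unique words are included, from most to least frequent
coverage <- cum_prob(sample_twitter)
ggplot(coverage, aes(x = seq_along(cumProb), y = cumProb)) +
    geom_line() +
    scale_x_log10() +
    labs(x = 'Number of unique words (log scale)', y = 'Coverage') +
    theme_bw()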

stem_cum_prob <- function(sample) {
    # Same as cum_prob(), but the text is stemmed first (tm::stemDocument)
    sample %>%
        stemDocument %>%
        frequency_dist %>%
        arrange(desc(Freq)) %>%
        mutate(cumProb = cumsum(Freq) / sum(Freq))
}

qs <- list(0.5, 0.9)
samples <- list(
    twitter = sample_twitter, 
    news = sample_news, 
    blogs = sample_blogs
    )

tb <- samples %>%
    map_dfc(function(sample) {
        # Number of most-frequent unique stems needed to exceed coverage q
        map_int(qs, function(q) {
            sample %>%
                stem_cum_prob %>%
                pull(cumProb) %>%
                {which.max(. > q)}
        })
    }) %>%
    as.data.frame %>%
    `row.names<-`(c('50%', '90%'))

print(xtable(tb), type='html')
        twitter   news   blogs
50%         107    166     104
90%        4352   3432    4762


Stemming the documents reduces the number of unique words needed to reach the same coverage:

In the twitter sample, 107 unique words are needed to cover 50% of all word instances, and 4352 unique words are needed to cover 90%.

For news, the corresponding figures are 166 (50%) and 3432 (90%).

For blogs, they are 104 (50%) and 4762 (90%).
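
As a quick illustration of what stemming does to the vocabulary (the example words are arbitrary, not taken from the corpus):

# tm::stemDocument collapses inflected forms onto a common stem
stemDocument(c("run", "runs", "running", "runner"))
# [1] "run"    "run"    "run"    "runner"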


Smiley / Sad Face Count

Number of smiley faces in the twitter sample: sum(str_count(sample_twitter, ':-\\)|:\\)')) = 1071. Number of sad faces: sum(str_count(sample_twitter, ':-\\(|:\\(')) = 173.
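
The same str_count() approach extends to several emoticon patterns at once; the extra pattern below is an assumption for illustration, not part of the original analysis.

# Hypothetical extension: count several emoticon variants in one pass
emoticons <- c(
    smiley = ':-\\)|:\\)',
    sad    = ':-\\(|:\\(',
    wink   = ';-\\)|;\\)'   # assumed additional pattern, for illustration only
)
sapply(emoticons, function(p) sum(str_count(sample_twitter, p)))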