Overview

This document contains the analysis for the Milestone Report of the Capstone Project of the Data Science course.

Input Data

First, let’s set up the project by loading the required libraries and importing all the data used in the analysis. We will then sample the datasets to get faster results.

library(dplyr)
library(plotly)
Loading required package: ggplot2

Attaching package: ‘plotly’

The following object is masked from ‘package:ggplot2’:

    last_plot

The following object is masked from ‘package:stats’:

    filter

The following object is masked from ‘package:graphics’:

    layout
library(tidytext)
library(tidyr)

Exploratory Analysis

Let’s start with some simple word counts, first removing the stop words from the files. This is the one for the blogs text file.

Word count

# Word frequencies for the sampled blogs text, with stop words excluded.
# NOTE(review): `stop.words` is not defined in the visible code; tidytext
# ships a `stop_words` table -- confirm which object is intended.
word.count.blogs <- txt.blogs.smp.df %>%
  # one row per token, stored in a `word` column
  unnest_tokens(word, text) %>%
  # keep only the tokens that are absent from the stop-word list
  filter(!word %in% stop.words$word) %>%
  # tally each distinct word, most frequent first
  count(word, sort = TRUE)
# display the resulting table in the notebook
word.count.blogs

This is the word count for the news file.

# Word frequencies for the sampled news text, with stop words excluded.
word.count.news <- txt.news.smp.df %>%
  # one row per token, stored in a `word` column
  unnest_tokens(word, text) %>%
  # keep only the tokens that are absent from the stop-word list
  filter(!word %in% stop.words$word) %>%
  # tally each distinct word, most frequent first
  count(word, sort = TRUE)
# display the resulting table in the notebook
word.count.news

Finally, this is the word count for the Twitter file.

# Word frequencies for the sampled Twitter text, with stop words excluded.
word.count.twitter <- txt.twitter.smp.df %>%
  # one row per token, stored in a `word` column
  unnest_tokens(word, text) %>%
  # keep only the tokens that are absent from the stop-word list
  filter(!word %in% stop.words$word) %>%
  # tally each distinct word, most frequent first
  count(word, sort = TRUE)
# display the resulting table in the notebook
word.count.twitter

Line count

Let’s run a simple line count on the three files.

# Report how many lines the full blogs data frame holds,
# formatted with "," as the thousands separator.
print(
  paste(
    "Blogs rows:",
    format(
      nrow(txt.blogs.df),
      decimal.mark = ".", big.mark = ",", small.mark = "."
    )
  )
)
[1] "Blogs rows: 899,288"
# Report how many lines the full news data frame holds,
# formatted with "," as the thousands separator.
print(
  paste(
    "News rows:",
    format(
      nrow(txt.news.df),
      decimal.mark = ".", big.mark = ",", small.mark = "."
    )
  )
)
[1] "News rows: 1,010,242"
# Report how many lines the full Twitter data frame holds,
# formatted with "," as the thousands separator.
print(
  paste(
    "Twitter rows:",
    format(
      nrow(txt.twitter.df),
      decimal.mark = ".", big.mark = ",", small.mark = "."
    )
  )
)
[1] "Twitter rows: 2,360,148"

Bigrams

Let’s now calculate the bigrams, i.e. the pairs of words that appear next to each other in the text. We will start with the blogs txt file.

# Bigram frequencies for the sampled blogs text, keeping only the pairs in
# which neither word is a stop word.
# NOTE(review): `stop.words` is not defined in the visible code; tidytext
# ships a `stop_words` table -- confirm which object is intended.
blogs.bigram <- txt.blogs.smp.df %>%
  # one row per pair of consecutive words, stored in a `bigram` column
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  # a line with fewer than two words can yield an NA bigram; `!NA %in% x`
  # is TRUE, so without this guard such rows would pass the stop-word
  # filter below and be counted as an (NA, NA) pair
  filter(!is.na(bigram)) %>%
  # split the pair on the single space into its two words
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  # drop the pair if either word is in the stop-word list
  filter(!word1 %in% stop.words$word, !word2 %in% stop.words$word) %>%
  # tally each distinct pair, most frequent first
  count(word1, word2, sort = TRUE)
# display the resulting table in the notebook
blogs.bigram

Then the news txt file.

# Bigram frequencies for the sampled news text, keeping only the pairs in
# which neither word is a stop word.
news.bigram <- txt.news.smp.df %>%
  # one row per pair of consecutive words, stored in a `bigram` column
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  # a line with fewer than two words can yield an NA bigram; `!NA %in% x`
  # is TRUE, so without this guard such rows would pass the stop-word
  # filter below and be counted as an (NA, NA) pair
  filter(!is.na(bigram)) %>%
  # split the pair on the single space into its two words
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  # drop the pair if either word is in the stop-word list
  filter(!word1 %in% stop.words$word, !word2 %in% stop.words$word) %>%
  # tally each distinct pair, most frequent first
  count(word1, word2, sort = TRUE)
# display the resulting table in the notebook
news.bigram

And finally the Twitter file.

# Bigram frequencies for the sampled Twitter text, keeping only the pairs in
# which neither word is a stop word.
twitter.bigram <- txt.twitter.smp.df %>%
  # one row per pair of consecutive words, stored in a `bigram` column
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  # a line with fewer than two words (e.g. a one-word tweet) can yield an
  # NA bigram; `!NA %in% x` is TRUE, so without this guard such rows would
  # pass the stop-word filter below and be counted as an (NA, NA) pair
  filter(!is.na(bigram)) %>%
  # split the pair on the single space into its two words
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  # drop the pair if either word is in the stop-word list
  filter(!word1 %in% stop.words$word, !word2 %in% stop.words$word) %>%
  # tally each distinct pair, most frequent first
  count(word1, word2, sort = TRUE)
# display the resulting table in the notebook
twitter.bigram

Charts

We would like to provide some charts to explore in more detail the data frames that will be used to build a prediction algorithm.

Line count

# Bar chart comparing the number of lines in each of the three full
# (unsampled) data frames.
plot_ly(
  x = c("Blogs", "News", "Twitter"),
  # row counts, in the same order as the x labels
  y = c(
    nrow(txt.blogs.df),
    nrow(txt.news.df),
    nrow(txt.twitter.df)
  ),
  name = "Line count",
  type = "bar"
) %>%
  layout(
    title = "Line count",
    xaxis = list(title = "data frame"),
    yaxis = list(title = "lines")
  )

Word count

This chart shows the top 10 words found in the blogs txt file.

# Bar chart of the ten most frequent words in the sampled blogs text.
plot_ly(
  # sort by descending count and keep the top 10; top_n() is called without
  # an explicit weighting column, so it selects by `n` and says so in a message
  arrange(word.count.blogs, desc(n)) %>% top_n(10),
  x = ~word,
  y = ~n,
  name = "Word count (top 10)",
  type = "bar"
) %>%
  # the title previously read "Twitter ..." (copy-paste slip); this chart
  # is built from the blogs word counts
  layout(
    title = "Blogs word count (top 10)",
    xaxis = list(title = "word"),
    yaxis = list(title = "frequency")
  )
Selecting by n

This chart shows the top 10 words found in the news txt file.

# Bar chart of the ten most frequent words in the sampled news text.
plot_ly(
  # sort by descending count and keep the top 10; top_n() is called without
  # an explicit weighting column, so it selects by `n` and says so in a message
  arrange(word.count.news, desc(n)) %>% top_n(10),
  x = ~word,
  y = ~n,
  name = "Word count (top 10)",
  type = "bar"
) %>%
  # the title previously read "Twitter ..." (copy-paste slip); this chart
  # is built from the news word counts
  layout(
    title = "News word count (top 10)",
    xaxis = list(title = "word"),
    yaxis = list(title = "frequency")
  )
Selecting by n

This chart shows the top 10 words found in the Twitter txt file.

# Bar chart of the ten most frequent words in the sampled Twitter text.
plot_ly(
  # sort by descending count and keep the top 10; top_n() is called without
  # an explicit weighting column, so it selects by `n` and says so in a message
  word.count.twitter %>% arrange(desc(n)) %>% top_n(10),
  x = ~word,
  y = ~n,
  name = "Word count (top 10)",
  type = "bar"
) %>%
  layout(
    title = "Twitter word count (top 10)",
    xaxis = list(title = "word"),
    yaxis = list(title = "frequency")
  )
Selecting by n
LS0tCnRpdGxlOiAiTWlsZXN0b25lIFJlcG9ydCIKb3V0cHV0OgogIGh0bWxfbm90ZWJvb2s6CiAgICBoaWdobGlnaHQ6IHRhbmdvCiAgICB0aGVtZTogY29zbW8KICAgIHRvYzogeWVzCiAgICB0b2NfZmxvYXQ6IHllcwotLS0KCiMjIE92ZXJ2aWV3CgpUaGlzIGRvY3VtZW50IGNvbnRhaW5zIHRoZSBhbmFseXNpcyBmb3IgdGhlIE1pbGVzdG9uZSBSZXBvcnQgb2YgdGhlIENhcHN0b25lIFByb2plY3Qgb2YgdGhlIERhdGEgU2NpZW5jZSBjb3Vyc2UuCgojIyBJbnB1dCBEYXRhCgpGaXJzdCBsZXQncyBzZXR1cCB0aGUgcHJvamVjdCBsb2FkaW5nIHRoZSBsaWJyYXJpZXMgbmVlZGVkIGFuZCBpbXBvcnQgYWxsIHRoZSBkYXRhIHVzZWQgaW4gdGhlIGFuYWx5c2lzLiBUaGVuIHdlIHdpbGwgc2FtcGxlIHRoZSBkYXRhc2V0cyB0byBoYXZlIGZhc3RlciByZXN1bHRzLgoKYGBge3IgbG9hZERhdGEsIGNhY2hlPVRSVUV9CmxpYnJhcnkoZHBseXIpCmxpYnJhcnkocGxvdGx5KQpsaWJyYXJ5KHRpZHl0ZXh0KQpsaWJyYXJ5KHRpZHlyKQoKIyMgUmF3IGRhdGEgZXh0cmFjdGlvbgojIyBjb25uZWN0aW9uIHRvIHRoZSBUd2l0dGVyIGZpbGUKY29uIDwtIGZpbGUoImRhdGEvZmluYWwvZW5fVVMvZW5fVVMuYmxvZ3MudHh0IiwgInIiKSAKCiMjIHJlYWRzIHRoZSBmaXJzdCBuIGxpbmVzCnR4dF9ibG9ncyA8LSByZWFkTGluZXMoY29uLCBza2lwTnVsID0gVFJVRSkKCiMjIENsb3NlIHRoZSBjb25uZWN0aW9uIG9uY2UgZG9uZQpjbG9zZShjb24sIHR5cGUgPSAiciIpCgojIyBjb25uZWN0aW9uIHRvIHRoZSBUd2l0dGVyIGZpbGUKY29uIDwtIGZpbGUoImRhdGEvZmluYWwvZW5fVVMvZW5fVVMubmV3cy50eHQiLCAiciIpIAoKIyMgcmVhZHMgdGhlIGZpcnN0IG4gbGluZXMKdHh0X25ld3MgPC0gcmVhZExpbmVzKGNvbiwgc2tpcE51bCA9IFRSVUUpCgojIyBDbG9zZSB0aGUgY29ubmVjdGlvbiBvbmNlIGRvbmUKY2xvc2UoY29uLCB0eXBlID0gInIiKQoKIyMgY29ubmVjdGlvbiB0byB0aGUgVHdpdHRlciBmaWxlCmNvbiA8LSBmaWxlKCJkYXRhL2ZpbmFsL2VuX1VTL2VuX1VTLnR3aXR0ZXIudHh0IiwgInIiKSAKCiMjIHJlYWRzIHRoZSBmaXJzdCBuIGxpbmVzCnR4dF90d2l0dGVyIDwtIHJlYWRMaW5lcyhjb24sIHNraXBOdWwgPSBUUlVFKQoKIyMgQ2xvc2UgdGhlIGNvbm5lY3Rpb24gb25jZSBkb25lCmNsb3NlKGNvbiwgdHlwZSA9ICJyIikKCiMjIFRyYW5zZm9ybXMgdGhlIHRleHQgZmlsZSBpbnRvIGEgZGF0YSBmcmFtZQojIGJsb2dzCnR4dC5ibG9ncy5kZiA8LSBkYXRhX2ZyYW1lKGxpbmUgPSAxOmxlbmd0aCh0eHRfYmxvZ3MpLCB0ZXh0ID0gdHh0X2Jsb2dzKQoKIyBuZXdzCnR4dC5uZXdzLmRmIDwtIGRhdGFfZnJhbWUobGluZSA9IDE6bGVuZ3RoKHR4dF9uZXdzKSwgdGV4dCA9IHR4dF9uZXdzKQoKIyB0d2l0dGVyCnR4dC50d2l0dGVyLmRmIDwtIGRhdGFfZnJhbWUobGluZSA9IDE6bGVuZ3RoKHR4dF90d2l0dGVyKSwgdGV4dCA9IHR4dF90d2l0
dGVyKQoKIyMgQ3JlYXRlcyBhIDQwJSBzYW1wbGUgb2YgdGhlIGRhdGEgZnJhbWVzCiMgYmxvZ3MKdHh0LmJsb2dzLnNtcC5kZiA8LSB0eHQuYmxvZ3MuZGZbc2FtcGxlKG5yb3codHh0LmJsb2dzLmRmKSwgbnJvdyh0eHQuYmxvZ3MuZGYpIC8gMzApLCBdCgojIG5ld3MKdHh0Lm5ld3Muc21wLmRmIDwtIHR4dC5uZXdzLmRmW3NhbXBsZShucm93KHR4dC5uZXdzLmRmKSwgbnJvdyh0eHQubmV3cy5kZikgLyAzMCksIF0KCiMgdHdpdHRlcgp0eHQudHdpdHRlci5zbXAuZGYgPC0gdHh0LnR3aXR0ZXIuZGZbc2FtcGxlKG5yb3codHh0LnR3aXR0ZXIuZGYpLCBucm93KHR4dC50d2l0dGVyLmRmKSAvIDMwKSwgXQpgYGAKCgojIyBFeHBsb3JhdG9yeSBBbmFseXNpcwoKTGV0J3Mgc3RhcnQgd2l0aCBzb21lIHNpbXBsZSB3b3JkIGNvdW50cywgZmlyc3QgcmVtb3ZpbmcgdGhlIHN0b3Agd29yZHMgZnJvbSB0aGUgZmlsZXMuIFRoaXMgaXMgdGhlIG9uZSBmb3IgdGhlIGJsb2dzIHRleHQgZmlsZS4KCiMjIyBXb3JkIGNvdW50CgpgYGB7ciB3b3JkQ291bnRCbG9ncywgY2FjaGU9VFJVRX0KIyB3b3JkIGNvdW50IGZvciBibG9ncwp3b3JkLmNvdW50LmJsb2dzIDwtIHR4dC5ibG9ncy5zbXAuZGYgJT4lCiAgICAgICAgdW5uZXN0X3Rva2Vucyh3b3JkLCB0ZXh0KSAlPiUgCiAgICAgICAgIyByZW1vdmVzIHRoZSBzdG9wIHdvcmRzCiAgICAgICAgZmlsdGVyKCEod29yZCAlaW4lIHN0b3Aud29yZHMkd29yZCkpICU+JSAKICAgICAgICBjb3VudCh3b3JkLCBzb3J0ID0gVFJVRSkgCgp3b3JkLmNvdW50LmJsb2dzCmBgYAoKVGhpcyBpcyB0aGUgd29yZCBjb3VudCBmb3IgdGhlIG5ld3MgZmlsZS4KCmBgYHtyIHdvcmRDb3VudE5ld3MsIGNhY2hlPVRSVUV9CiMgd29yZCBjb3VudCBmb3IgbmV3cwp3b3JkLmNvdW50Lm5ld3MgPC0gdHh0Lm5ld3Muc21wLmRmICU+JQogICAgICAgIHVubmVzdF90b2tlbnMod29yZCwgdGV4dCkgJT4lIAogICAgICAgICMgcmVtb3ZlcyB0aGUgc3RvcCB3b3JkcwogICAgICAgIGZpbHRlcighKHdvcmQgJWluJSBzdG9wLndvcmRzJHdvcmQpKSAlPiUgCiAgICAgICAgY291bnQod29yZCwgc29ydCA9IFRSVUUpIAoKd29yZC5jb3VudC5uZXdzCmBgYAoKRmluYWxseSwgdGhlIHdvcmQgY291bnQgZm9yIHRoZSBUd2l0dGVyIGZpbGUKCmBgYHtyIHdvcmRDb3VudFR3aXR0ZXIsIGNhY2hlPVRSVUV9CiMgd29yZCBjb3VudCBmb3IgdHdpdHRlcgp3b3JkLmNvdW50LnR3aXR0ZXIgPC0gdHh0LnR3aXR0ZXIuc21wLmRmICU+JQogICAgICAgIHVubmVzdF90b2tlbnMod29yZCwgdGV4dCkgJT4lIAogICAgICAgIGZpbHRlcighKHdvcmQgJWluJSBzdG9wLndvcmRzJHdvcmQpKSAlPiUgCiAgICAgICAgY291bnQod29yZCwgc29ydCA9IFRSVUUpIAoKd29yZC5jb3VudC50d2l0dGVyCmBgYAoKIyMgTGluZSBjb3VudAoKTGV0J3MgZXhlY3V0ZSBzb21lIHNpbXBsZSBsaW5lIGNvdW50IG9uIHRoZSB0aHJlZSBmaWxlcy4KCmBgYHtyfQojIEJsb2dz
IGRhdGEgZnJhbWUKcHJpbnQocGFzdGUoIkJsb2dzIHJvd3M6IiwgZm9ybWF0KG5yb3codHh0LmJsb2dzLmRmKSwgZGVjaW1hbC5tYXJrPSIuIiwgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBiaWcubWFyaz0iLCIsIHNtYWxsLm1hcms9Ii4iKSkpCgojIE5ld3MgZGF0YSBmcmFtZQpwcmludChwYXN0ZSgiTmV3cyByb3dzOiIsIGZvcm1hdChucm93KHR4dC5uZXdzLmRmKSwgZGVjaW1hbC5tYXJrPSIuIiwgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGJpZy5tYXJrPSIsIiwgc21hbGwubWFyaz0iLiIpKSkKCiMgVHdpdHRlciBkYXRhIGZyYW1lCnByaW50KHBhc3RlKCJUd2l0dGVyIHJvd3M6IiwgZm9ybWF0KG5yb3codHh0LnR3aXR0ZXIuZGYpLCBkZWNpbWFsLm1hcms9Ii4iLCAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgYmlnLm1hcms9IiwiLCBzbWFsbC5tYXJrPSIuIikpKQpgYGAKCiMjIEJpZ3JhbXMKCkxldCdzIG5vdyBjYWxjdWxhdGUgdGhlIGJpZ3JhbXMsIG9yIHRoZSB0d28gd29yZHMgdGhhdCBjb21lcyB0b2dldGhlciBpbiB0aGUgdGV4dC4gV2Ugd2lsbCBzdGFydCB3aXRoIHRoZSBibG9ncyB0eHQgZmlsZS4KCmBgYHtyIGJsb2dzU2FtcGxlQmlncmFtLCBjYWNoZT1UUlVFfQpibG9ncy5iaWdyYW0gPC0gdHh0LmJsb2dzLnNtcC5kZiAlPiUKICAgICAgICB1bm5lc3RfdG9rZW5zKGJpZ3JhbSwgdGV4dCwgdG9rZW4gPSAibmdyYW1zIiwgbiA9IDIpICU+JSAKICAgICAgICBzZXBhcmF0ZShiaWdyYW0sIGMoIndvcmQxIiwgIndvcmQyIiksIHNlcCA9ICIgIikgJT4lIAogICAgICAgIGZpbHRlcighd29yZDEgJWluJSBzdG9wLndvcmRzJHdvcmQpICU+JQogICAgICAgIGZpbHRlcighd29yZDIgJWluJSBzdG9wLndvcmRzJHdvcmQpICU+JSAKICAgICAgICBjb3VudCh3b3JkMSwgd29yZDIsIHNvcnQgPSBUUlVFKQoKYmxvZ3MuYmlncmFtCmBgYAoKVGhlbiB0aGUgbmV3cyB0eHQgZmlsZS4KCmBgYHtyIG5ld3NTYW1wbGVCaWdyYW0sIGNhY2hlPVRSVUV9Cm5ld3MuYmlncmFtIDwtIHR4dC5uZXdzLnNtcC5kZiAlPiUKICAgICAgICB1bm5lc3RfdG9rZW5zKGJpZ3JhbSwgdGV4dCwgdG9rZW4gPSAibmdyYW1zIiwgbiA9IDIpICU+JSAKICAgICAgICBzZXBhcmF0ZShiaWdyYW0sIGMoIndvcmQxIiwgIndvcmQyIiksIHNlcCA9ICIgIikgJT4lIAogICAgICAgIGZpbHRlcighd29yZDEgJWluJSBzdG9wLndvcmRzJHdvcmQpICU+JQogICAgICAgIGZpbHRlcighd29yZDIgJWluJSBzdG9wLndvcmRzJHdvcmQpICU+JSAKICAgICAgICBjb3VudCh3b3JkMSwgd29yZDIsIHNvcnQgPSBUUlVFKQoKbmV3cy5iaWdyYW0KYGBgCgpBbmQgZmluYWxseSB0aGUgVHdpdHRlciBmaWxlLgoKYGBge3IgdHdpdHRlclNhbXBsZUJpZ3JhbSwgY2FjaGU9VFJVRX0KdHdpdHRlci5iaWdyYW0gPC0gdHh0LnR3aXR0ZXIuc21wLmRmICU+JQogICAgICAgIHVubmVzdF90b2tlbnMoYmlncmFtLCB0ZXh0LCB0
b2tlbiA9ICJuZ3JhbXMiLCBuID0gMikgJT4lIAogICAgICAgIHNlcGFyYXRlKGJpZ3JhbSwgYygid29yZDEiLCAid29yZDIiKSwgc2VwID0gIiAiKSAlPiUgCiAgICAgICAgZmlsdGVyKCF3b3JkMSAlaW4lIHN0b3Aud29yZHMkd29yZCkgJT4lCiAgICAgICAgZmlsdGVyKCF3b3JkMiAlaW4lIHN0b3Aud29yZHMkd29yZCkgJT4lIAogICAgICAgIGNvdW50KHdvcmQxLCB3b3JkMiwgc29ydCA9IFRSVUUpCgp0d2l0dGVyLmJpZ3JhbQpgYGAKCgojIyBDaGFydHMKCldlIHdvdWxkIGxpa2UgdG8gcHJvdmlkZSBzb21lIGNoYXJ0cyB0byBleHBsb3JlIGEgYml0IG1vcmUgdGhlIGRhdGEgZnJhbWVzIHRoYXQgd2lsbCBiZSB1c2VkIHRvIG1ha2UgYSBwcmVkaWN0aW9uIGFsZ29yaXRobQoKIyMjIExpbmUgY291bnQKCmBgYHtyfQpwbG90X2x5KAogICAgICAgIHggPSBjKCJCbG9ncyIsICJOZXdzIiwgIlR3aXR0ZXIiKSwKICAgICAgICB5ID0gYyhucm93KHR4dC5ibG9ncy5kZiksIG5yb3codHh0Lm5ld3MuZGYpLCBucm93KHR4dC50d2l0dGVyLmRmKSksCiAgICAgICAgbmFtZSA9ICJMaW5lIGNvdW50IiwKICAgICAgICB0eXBlID0gImJhciIKKSAlPiUgCiAgICAgICAgbGF5b3V0KHRpdGxlID0gIkxpbmUgY291bnQiLAogICAgICAgICAgICAgICB4YXhpcyA9IGxpc3QodGl0bGUgPSAiZGF0YSBmcmFtZSIpLAogICAgICAgICAgICAgICB5YXhpcyA9IGxpc3QodGl0bGUgPSAibGluZXMiKSkKCmBgYAoKIyMjIFdvcmQgY291bnQKClRoaXMgY2hhcnQgc2hvd3MgdGhlIHRvcCAxMCB3b3JkIGZvdW5kIGluIHRoZSBibG9ncyB0eHQgZmlsZQoKYGBge3IgYmxvZ3NXb3JkQ291bnR9CnBsb3RfbHkoCiAgICAgICAgYXJyYW5nZSh3b3JkLmNvdW50LmJsb2dzLCBkZXNjKG4pKSAlPiUgdG9wX24oMTApLAogICAgICAgIHggPSB+d29yZCwKICAgICAgICB5ID0gfm4sCiAgICAgICAgbmFtZSA9ICJXb3JkIGNvdW50ICh0b3AgMTApIiwKICAgICAgICB0eXBlID0gImJhciIKKSAlPiUgCiAgICAgICAgbGF5b3V0KHRpdGxlID0gIlR3aXR0ZXIgd29yZCBjb3VudCAodG9wIDEwKSIsCiAgICAgICAgICAgICAgIHhheGlzID0gbGlzdCh0aXRsZSA9ICJ3b3JkIiksCiAgICAgICAgICAgICAgIHlheGlzID0gbGlzdCh0aXRsZSA9ICJmcmVxdWVuY3kiKSkKYGBgCgpUaGlzIGNoYXJ0IHNob3dzIHRoZSB0b3AgMTAgd29yZCBmb3VuZCBpbiB0aGUgbmV3cyB0eHQgZmlsZQoKYGBge3IgbmV3c1dvcmRDb3VudH0KcGxvdF9seSgKICAgICAgICBhcnJhbmdlKHdvcmQuY291bnQubmV3cywgZGVzYyhuKSkgJT4lIHRvcF9uKDEwKSwKICAgICAgICB4ID0gfndvcmQsCiAgICAgICAgeSA9IH5uLAogICAgICAgIG5hbWUgPSAiV29yZCBjb3VudCAodG9wIDEwKSIsCiAgICAgICAgdHlwZSA9ICJiYXIiCikgJT4lIAogICAgICAgIGxheW91dCh0aXRsZSA9ICJUd2l0dGVyIHdvcmQgY291bnQgKHRvcCAxMCkiLAogICAgICAgICAgICAgICB4YXhpcyA9IGxpc3QodGl0bGUgPSAid29y
ZCIpLAogICAgICAgICAgICAgICB5YXhpcyA9IGxpc3QodGl0bGUgPSAiZnJlcXVlbmN5IikpCmBgYAoKVGhpcyBjaGFydCBzaG93cyB0aGUgdG9wIDEwIHdvcmQgZm91bmQgaW4gdGhlIHR3aXR0ZXIgdHh0IGZpbGUKCmBgYHtyIHR3aXR0ZXJXb3JkQ291bnR9CnBsb3RfbHkoCiAgICAgICAgYXJyYW5nZSh3b3JkLmNvdW50LnR3aXR0ZXIsIGRlc2MobikpICU+JSB0b3BfbigxMCksCiAgICAgICAgeCA9IH53b3JkLAogICAgICAgIHkgPSB+biwKICAgICAgICBuYW1lID0gIldvcmQgY291bnQgKHRvcCAxMCkiLAogICAgICAgIHR5cGUgPSAiYmFyIgopICU+JSAKICAgICAgICBsYXlvdXQodGl0bGUgPSAiVHdpdHRlciB3b3JkIGNvdW50ICh0b3AgMTApIiwKICAgICAgICAgICAgICAgeGF4aXMgPSBsaXN0KHRpdGxlID0gIndvcmQiKSwKICAgICAgICAgICAgICAgeWF4aXMgPSBsaXN0KHRpdGxlID0gImZyZXF1ZW5jeSIpKQpgYGAK