Overview
This document contains the analysis for the Milestone Report of the Capstone Project of the Data Science course.
Exploratory Analysis
Let’s start with some simple word counts, first removing the stop words from the files. This is the one for the blogs text file.
Word count
# Word frequencies for the blogs sample: split each line into single-word
# tokens, discard every token listed in stop.words$word, then tally the rest
# with the most frequent word first.
# NOTE(review): stop.words is not defined in this section; it must supply a
# `word` element -- confirm where it is created.
word.count.blogs <-
  txt.blogs.smp.df %>%
  unnest_tokens(output = word, input = text) %>%
  filter(!word %in% stop.words$word) %>%
  count(word, sort = TRUE)

# display the frequency table
word.count.blogs
This is the word count for the news file.
# Word frequencies for the news sample: split each line into single-word
# tokens, discard every token listed in stop.words$word, then tally the rest
# with the most frequent word first.
word.count.news <-
  txt.news.smp.df %>%
  unnest_tokens(output = word, input = text) %>%
  filter(!word %in% stop.words$word) %>%
  count(word, sort = TRUE)

# display the frequency table
word.count.news
Finally, the word count for the Twitter file.
# Word frequencies for the Twitter sample: split each line into single-word
# tokens, discard every token listed in stop.words$word, then tally the rest
# with the most frequent word first.
word.count.twitter <-
  txt.twitter.smp.df %>%
  unnest_tokens(output = word, input = text) %>%
  filter(!word %in% stop.words$word) %>%
  count(word, sort = TRUE)

# display the frequency table
word.count.twitter
Line count
Let’s run a simple line count on each of the three files.
# Blogs data frame: print its row count with "," as the thousands separator.
# local() keeps the helper variable out of the global environment.
local({
  blogs.rows <- format(
    nrow(txt.blogs.df),
    big.mark = ",",
    decimal.mark = ".",
    small.mark = "."
  )
  print(paste("Blogs rows:", blogs.rows))
})
[1] "Blogs rows: 899,288"
# News data frame: print its row count with "," as the thousands separator.
# local() keeps the helper variable out of the global environment.
local({
  news.rows <- format(
    nrow(txt.news.df),
    big.mark = ",",
    decimal.mark = ".",
    small.mark = "."
  )
  print(paste("News rows:", news.rows))
})
[1] "News rows: 1,010,242"
# Twitter data frame: print its row count with "," as the thousands separator.
# local() keeps the helper variable out of the global environment.
local({
  twitter.rows <- format(
    nrow(txt.twitter.df),
    big.mark = ",",
    decimal.mark = ".",
    small.mark = "."
  )
  print(paste("Twitter rows:", twitter.rows))
})
[1] "Twitter rows: 2,360,148"
Bigrams
Let’s now calculate the bigrams, that is, the pairs of words that appear next to each other in the text. We will start with the blogs txt file.
# Bigram frequencies for the blogs sample.
# Each line is tokenised into two-word n-grams, the bigram is split on the
# space into word1 / word2, pairs where either word is in stop.words$word are
# dropped, and the surviving pairs are counted (most frequent first).
blogs.bigram <-
  txt.blogs.smp.df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(col = bigram, into = c("word1", "word2"), sep = " ") %>%
  filter(
    !(word1 %in% stop.words$word),
    !(word2 %in% stop.words$word)
  ) %>%
  count(word1, word2, sort = TRUE)

# display the bigram table
blogs.bigram
Then the news txt file.
# Bigram frequencies for the news sample.
# Each line is tokenised into two-word n-grams, the bigram is split on the
# space into word1 / word2, pairs where either word is in stop.words$word are
# dropped, and the surviving pairs are counted (most frequent first).
news.bigram <-
  txt.news.smp.df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(col = bigram, into = c("word1", "word2"), sep = " ") %>%
  filter(
    !(word1 %in% stop.words$word),
    !(word2 %in% stop.words$word)
  ) %>%
  count(word1, word2, sort = TRUE)

# display the bigram table
news.bigram
And finally the Twitter file.
# Bigram frequencies for the Twitter sample.
# Each line is tokenised into two-word n-grams, the bigram is split on the
# space into word1 / word2, pairs where either word is in stop.words$word are
# dropped, and the surviving pairs are counted (most frequent first).
twitter.bigram <-
  txt.twitter.smp.df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(col = bigram, into = c("word1", "word2"), sep = " ") %>%
  filter(
    !(word1 %in% stop.words$word),
    !(word2 %in% stop.words$word)
  ) %>%
  count(word1, word2, sort = TRUE)

# display the bigram table
twitter.bigram
Charts
We would like to provide some charts to explore in a bit more detail the data frames that will be used to build a prediction algorithm.
Line count
# Bar chart comparing the number of lines (rows) in the three full data
# frames. The inner plot_ly() call builds the bars; layout() adds the chart
# title and the axis titles.
layout(
  plot_ly(
    x = c("Blogs", "News", "Twitter"),
    # one row count per data frame, in the same order as the x labels
    y = vapply(
      list(txt.blogs.df, txt.news.df, txt.twitter.df),
      nrow,
      integer(1)
    ),
    name = "Line count",
    type = "bar"
  ),
  title = "Line count",
  xaxis = list(title = "data frame"),
  yaxis = list(title = "lines")
)
Word count
This chart shows the top 10 words found in the blogs txt file.
# Bar chart of the ten most frequent words in the blogs word count.
# Fix: the title previously read "Twitter word count (top 10)", a copy-paste
# slip -- this chart plots word.count.blogs.
# top_n(10) is called without an explicit weight, which is why the rendered
# output shows the message "Selecting by n".
word.count.blogs %>%
  arrange(desc(n)) %>%
  top_n(10) %>%
  plot_ly(
    x = ~word,
    y = ~n,
    name = "Word count (top 10)",
    type = "bar"
  ) %>%
  layout(
    title = "Blogs word count (top 10)",
    xaxis = list(title = "word"),
    yaxis = list(title = "frequency")
  )
Selecting by n
This chart shows the top 10 words found in the news txt file.
# Bar chart of the ten most frequent words in the news word count.
# Fix: the title previously read "Twitter word count (top 10)", a copy-paste
# slip -- this chart plots word.count.news.
# top_n(10) is called without an explicit weight, which is why the rendered
# output shows the message "Selecting by n".
word.count.news %>%
  arrange(desc(n)) %>%
  top_n(10) %>%
  plot_ly(
    x = ~word,
    y = ~n,
    name = "Word count (top 10)",
    type = "bar"
  ) %>%
  layout(
    title = "News word count (top 10)",
    xaxis = list(title = "word"),
    yaxis = list(title = "frequency")
  )
Selecting by n
This chart shows the top 10 words found in the Twitter txt file.
# Bar chart of the ten most frequent words in the Twitter word count.
# top_n(10) is called without an explicit weight, which is why the rendered
# output shows the message "Selecting by n".
word.count.twitter %>%
  arrange(desc(n)) %>%
  top_n(10) %>%
  plot_ly(
    x = ~word,
    y = ~n,
    name = "Word count (top 10)",
    type = "bar"
  ) %>%
  layout(
    title = "Twitter word count (top 10)",
    xaxis = list(title = "word"),
    yaxis = list(title = "frequency")
  )
Selecting by n
LS0tCnRpdGxlOiAiTWlsZXN0b25lIFJlcG9ydCIKb3V0cHV0OgogIGh0bWxfbm90ZWJvb2s6CiAgICBoaWdobGlnaHQ6IHRhbmdvCiAgICB0aGVtZTogY29zbW8KICAgIHRvYzogeWVzCiAgICB0b2NfZmxvYXQ6IHllcwotLS0KCiMjIE92ZXJ2aWV3CgpUaGlzIGRvY3VtZW50IGNvbnRhaW5zIHRoZSBhbmFseXNpcyBmb3IgdGhlIE1pbGVzdG9uZSBSZXBvcnQgb2YgdGhlIENhcHN0b25lIFByb2plY3Qgb2YgdGhlIERhdGEgU2NpZW5jZSBjb3Vyc2UuCgojIyBJbnB1dCBEYXRhCgpGaXJzdCBsZXQncyBzZXR1cCB0aGUgcHJvamVjdCBsb2FkaW5nIHRoZSBsaWJyYXJpZXMgbmVlZGVkIGFuZCBpbXBvcnQgYWxsIHRoZSBkYXRhIHVzZWQgaW4gdGhlIGFuYWx5c2lzLiBUaGVuIHdlIHdpbGwgc2FtcGxlIHRoZSBkYXRhc2V0cyB0byBoYXZlIGZhc3RlciByZXN1bHRzLgoKYGBge3IgbG9hZERhdGEsIGNhY2hlPVRSVUV9CmxpYnJhcnkoZHBseXIpCmxpYnJhcnkocGxvdGx5KQpsaWJyYXJ5KHRpZHl0ZXh0KQpsaWJyYXJ5KHRpZHlyKQoKIyMgUmF3IGRhdGEgZXh0cmFjdGlvbgojIyBjb25uZWN0aW9uIHRvIHRoZSBUd2l0dGVyIGZpbGUKY29uIDwtIGZpbGUoImRhdGEvZmluYWwvZW5fVVMvZW5fVVMuYmxvZ3MudHh0IiwgInIiKSAKCiMjIHJlYWRzIHRoZSBmaXJzdCBuIGxpbmVzCnR4dF9ibG9ncyA8LSByZWFkTGluZXMoY29uLCBza2lwTnVsID0gVFJVRSkKCiMjIENsb3NlIHRoZSBjb25uZWN0aW9uIG9uY2UgZG9uZQpjbG9zZShjb24sIHR5cGUgPSAiciIpCgojIyBjb25uZWN0aW9uIHRvIHRoZSBUd2l0dGVyIGZpbGUKY29uIDwtIGZpbGUoImRhdGEvZmluYWwvZW5fVVMvZW5fVVMubmV3cy50eHQiLCAiciIpIAoKIyMgcmVhZHMgdGhlIGZpcnN0IG4gbGluZXMKdHh0X25ld3MgPC0gcmVhZExpbmVzKGNvbiwgc2tpcE51bCA9IFRSVUUpCgojIyBDbG9zZSB0aGUgY29ubmVjdGlvbiBvbmNlIGRvbmUKY2xvc2UoY29uLCB0eXBlID0gInIiKQoKIyMgY29ubmVjdGlvbiB0byB0aGUgVHdpdHRlciBmaWxlCmNvbiA8LSBmaWxlKCJkYXRhL2ZpbmFsL2VuX1VTL2VuX1VTLnR3aXR0ZXIudHh0IiwgInIiKSAKCiMjIHJlYWRzIHRoZSBmaXJzdCBuIGxpbmVzCnR4dF90d2l0dGVyIDwtIHJlYWRMaW5lcyhjb24sIHNraXBOdWwgPSBUUlVFKQoKIyMgQ2xvc2UgdGhlIGNvbm5lY3Rpb24gb25jZSBkb25lCmNsb3NlKGNvbiwgdHlwZSA9ICJyIikKCiMjIFRyYW5zZm9ybXMgdGhlIHRleHQgZmlsZSBpbnRvIGEgZGF0YSBmcmFtZQojIGJsb2dzCnR4dC5ibG9ncy5kZiA8LSBkYXRhX2ZyYW1lKGxpbmUgPSAxOmxlbmd0aCh0eHRfYmxvZ3MpLCB0ZXh0ID0gdHh0X2Jsb2dzKQoKIyBuZXdzCnR4dC5uZXdzLmRmIDwtIGRhdGFfZnJhbWUobGluZSA9IDE6bGVuZ3RoKHR4dF9uZXdzKSwgdGV4dCA9IHR4dF9uZXdzKQoKIyB0d2l0dGVyCnR4dC50d2l0dGVyLmRmIDwtIGRhdGFfZnJhbWUobGluZSA9IDE6bGVuZ3RoKHR4dF90d2l0dGVyKSwgdGV4dCA9IHR4dF90d2l0
dGVyKQoKIyMgQ3JlYXRlcyBhIDQwJSBzYW1wbGUgb2YgdGhlIGRhdGEgZnJhbWVzCiMgYmxvZ3MKdHh0LmJsb2dzLnNtcC5kZiA8LSB0eHQuYmxvZ3MuZGZbc2FtcGxlKG5yb3codHh0LmJsb2dzLmRmKSwgbnJvdyh0eHQuYmxvZ3MuZGYpIC8gMzApLCBdCgojIG5ld3MKdHh0Lm5ld3Muc21wLmRmIDwtIHR4dC5uZXdzLmRmW3NhbXBsZShucm93KHR4dC5uZXdzLmRmKSwgbnJvdyh0eHQubmV3cy5kZikgLyAzMCksIF0KCiMgdHdpdHRlcgp0eHQudHdpdHRlci5zbXAuZGYgPC0gdHh0LnR3aXR0ZXIuZGZbc2FtcGxlKG5yb3codHh0LnR3aXR0ZXIuZGYpLCBucm93KHR4dC50d2l0dGVyLmRmKSAvIDMwKSwgXQpgYGAKCgojIyBFeHBsb3JhdG9yeSBBbmFseXNpcwoKTGV0J3Mgc3RhcnQgd2l0aCBzb21lIHNpbXBsZSB3b3JkIGNvdW50cywgZmlyc3QgcmVtb3ZpbmcgdGhlIHN0b3Agd29yZHMgZnJvbSB0aGUgZmlsZXMuIFRoaXMgaXMgdGhlIG9uZSBmb3IgdGhlIGJsb2dzIHRleHQgZmlsZS4KCiMjIyBXb3JkIGNvdW50CgpgYGB7ciB3b3JkQ291bnRCbG9ncywgY2FjaGU9VFJVRX0KIyB3b3JkIGNvdW50IGZvciBibG9ncwp3b3JkLmNvdW50LmJsb2dzIDwtIHR4dC5ibG9ncy5zbXAuZGYgJT4lCiAgICAgICAgdW5uZXN0X3Rva2Vucyh3b3JkLCB0ZXh0KSAlPiUgCiAgICAgICAgIyByZW1vdmVzIHRoZSBzdG9wIHdvcmRzCiAgICAgICAgZmlsdGVyKCEod29yZCAlaW4lIHN0b3Aud29yZHMkd29yZCkpICU+JSAKICAgICAgICBjb3VudCh3b3JkLCBzb3J0ID0gVFJVRSkgCgp3b3JkLmNvdW50LmJsb2dzCmBgYAoKVGhpcyBpcyB0aGUgd29yZCBjb3VudCBmb3IgdGhlIG5ld3MgZmlsZS4KCmBgYHtyIHdvcmRDb3VudE5ld3MsIGNhY2hlPVRSVUV9CiMgd29yZCBjb3VudCBmb3IgbmV3cwp3b3JkLmNvdW50Lm5ld3MgPC0gdHh0Lm5ld3Muc21wLmRmICU+JQogICAgICAgIHVubmVzdF90b2tlbnMod29yZCwgdGV4dCkgJT4lIAogICAgICAgICMgcmVtb3ZlcyB0aGUgc3RvcCB3b3JkcwogICAgICAgIGZpbHRlcighKHdvcmQgJWluJSBzdG9wLndvcmRzJHdvcmQpKSAlPiUgCiAgICAgICAgY291bnQod29yZCwgc29ydCA9IFRSVUUpIAoKd29yZC5jb3VudC5uZXdzCmBgYAoKRmluYWxseSwgdGhlIHdvcmQgY291bnQgZm9yIHRoZSBUd2l0dGVyIGZpbGUKCmBgYHtyIHdvcmRDb3VudFR3aXR0ZXIsIGNhY2hlPVRSVUV9CiMgd29yZCBjb3VudCBmb3IgdHdpdHRlcgp3b3JkLmNvdW50LnR3aXR0ZXIgPC0gdHh0LnR3aXR0ZXIuc21wLmRmICU+JQogICAgICAgIHVubmVzdF90b2tlbnMod29yZCwgdGV4dCkgJT4lIAogICAgICAgIGZpbHRlcighKHdvcmQgJWluJSBzdG9wLndvcmRzJHdvcmQpKSAlPiUgCiAgICAgICAgY291bnQod29yZCwgc29ydCA9IFRSVUUpIAoKd29yZC5jb3VudC50d2l0dGVyCmBgYAoKIyMgTGluZSBjb3VudAoKTGV0J3MgZXhlY3V0ZSBzb21lIHNpbXBsZSBsaW5lIGNvdW50IG9uIHRoZSB0aHJlZSBmaWxlcy4KCmBgYHtyfQojIEJsb2dz
IGRhdGEgZnJhbWUKcHJpbnQocGFzdGUoIkJsb2dzIHJvd3M6IiwgZm9ybWF0KG5yb3codHh0LmJsb2dzLmRmKSwgZGVjaW1hbC5tYXJrPSIuIiwgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBiaWcubWFyaz0iLCIsIHNtYWxsLm1hcms9Ii4iKSkpCgojIE5ld3MgZGF0YSBmcmFtZQpwcmludChwYXN0ZSgiTmV3cyByb3dzOiIsIGZvcm1hdChucm93KHR4dC5uZXdzLmRmKSwgZGVjaW1hbC5tYXJrPSIuIiwgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGJpZy5tYXJrPSIsIiwgc21hbGwubWFyaz0iLiIpKSkKCiMgVHdpdHRlciBkYXRhIGZyYW1lCnByaW50KHBhc3RlKCJUd2l0dGVyIHJvd3M6IiwgZm9ybWF0KG5yb3codHh0LnR3aXR0ZXIuZGYpLCBkZWNpbWFsLm1hcms9Ii4iLCAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgYmlnLm1hcms9IiwiLCBzbWFsbC5tYXJrPSIuIikpKQpgYGAKCiMjIEJpZ3JhbXMKCkxldCdzIG5vdyBjYWxjdWxhdGUgdGhlIGJpZ3JhbXMsIG9yIHRoZSB0d28gd29yZHMgdGhhdCBjb21lcyB0b2dldGhlciBpbiB0aGUgdGV4dC4gV2Ugd2lsbCBzdGFydCB3aXRoIHRoZSBibG9ncyB0eHQgZmlsZS4KCmBgYHtyIGJsb2dzU2FtcGxlQmlncmFtLCBjYWNoZT1UUlVFfQpibG9ncy5iaWdyYW0gPC0gdHh0LmJsb2dzLnNtcC5kZiAlPiUKICAgICAgICB1bm5lc3RfdG9rZW5zKGJpZ3JhbSwgdGV4dCwgdG9rZW4gPSAibmdyYW1zIiwgbiA9IDIpICU+JSAKICAgICAgICBzZXBhcmF0ZShiaWdyYW0sIGMoIndvcmQxIiwgIndvcmQyIiksIHNlcCA9ICIgIikgJT4lIAogICAgICAgIGZpbHRlcighd29yZDEgJWluJSBzdG9wLndvcmRzJHdvcmQpICU+JQogICAgICAgIGZpbHRlcighd29yZDIgJWluJSBzdG9wLndvcmRzJHdvcmQpICU+JSAKICAgICAgICBjb3VudCh3b3JkMSwgd29yZDIsIHNvcnQgPSBUUlVFKQoKYmxvZ3MuYmlncmFtCmBgYAoKVGhlbiB0aGUgbmV3cyB0eHQgZmlsZS4KCmBgYHtyIG5ld3NTYW1wbGVCaWdyYW0sIGNhY2hlPVRSVUV9Cm5ld3MuYmlncmFtIDwtIHR4dC5uZXdzLnNtcC5kZiAlPiUKICAgICAgICB1bm5lc3RfdG9rZW5zKGJpZ3JhbSwgdGV4dCwgdG9rZW4gPSAibmdyYW1zIiwgbiA9IDIpICU+JSAKICAgICAgICBzZXBhcmF0ZShiaWdyYW0sIGMoIndvcmQxIiwgIndvcmQyIiksIHNlcCA9ICIgIikgJT4lIAogICAgICAgIGZpbHRlcighd29yZDEgJWluJSBzdG9wLndvcmRzJHdvcmQpICU+JQogICAgICAgIGZpbHRlcighd29yZDIgJWluJSBzdG9wLndvcmRzJHdvcmQpICU+JSAKICAgICAgICBjb3VudCh3b3JkMSwgd29yZDIsIHNvcnQgPSBUUlVFKQoKbmV3cy5iaWdyYW0KYGBgCgpBbmQgZmluYWxseSB0aGUgVHdpdHRlciBmaWxlLgoKYGBge3IgdHdpdHRlclNhbXBsZUJpZ3JhbSwgY2FjaGU9VFJVRX0KdHdpdHRlci5iaWdyYW0gPC0gdHh0LnR3aXR0ZXIuc21wLmRmICU+JQogICAgICAgIHVubmVzdF90b2tlbnMoYmlncmFtLCB0ZXh0LCB0
b2tlbiA9ICJuZ3JhbXMiLCBuID0gMikgJT4lIAogICAgICAgIHNlcGFyYXRlKGJpZ3JhbSwgYygid29yZDEiLCAid29yZDIiKSwgc2VwID0gIiAiKSAlPiUgCiAgICAgICAgZmlsdGVyKCF3b3JkMSAlaW4lIHN0b3Aud29yZHMkd29yZCkgJT4lCiAgICAgICAgZmlsdGVyKCF3b3JkMiAlaW4lIHN0b3Aud29yZHMkd29yZCkgJT4lIAogICAgICAgIGNvdW50KHdvcmQxLCB3b3JkMiwgc29ydCA9IFRSVUUpCgp0d2l0dGVyLmJpZ3JhbQpgYGAKCgojIyBDaGFydHMKCldlIHdvdWxkIGxpa2UgdG8gcHJvdmlkZSBzb21lIGNoYXJ0cyB0byBleHBsb3JlIGEgYml0IG1vcmUgdGhlIGRhdGEgZnJhbWVzIHRoYXQgd2lsbCBiZSB1c2VkIHRvIG1ha2UgYSBwcmVkaWN0aW9uIGFsZ29yaXRobQoKIyMjIExpbmUgY291bnQKCmBgYHtyfQpwbG90X2x5KAogICAgICAgIHggPSBjKCJCbG9ncyIsICJOZXdzIiwgIlR3aXR0ZXIiKSwKICAgICAgICB5ID0gYyhucm93KHR4dC5ibG9ncy5kZiksIG5yb3codHh0Lm5ld3MuZGYpLCBucm93KHR4dC50d2l0dGVyLmRmKSksCiAgICAgICAgbmFtZSA9ICJMaW5lIGNvdW50IiwKICAgICAgICB0eXBlID0gImJhciIKKSAlPiUgCiAgICAgICAgbGF5b3V0KHRpdGxlID0gIkxpbmUgY291bnQiLAogICAgICAgICAgICAgICB4YXhpcyA9IGxpc3QodGl0bGUgPSAiZGF0YSBmcmFtZSIpLAogICAgICAgICAgICAgICB5YXhpcyA9IGxpc3QodGl0bGUgPSAibGluZXMiKSkKCmBgYAoKIyMjIFdvcmQgY291bnQKClRoaXMgY2hhcnQgc2hvd3MgdGhlIHRvcCAxMCB3b3JkIGZvdW5kIGluIHRoZSBibG9ncyB0eHQgZmlsZQoKYGBge3IgYmxvZ3NXb3JkQ291bnR9CnBsb3RfbHkoCiAgICAgICAgYXJyYW5nZSh3b3JkLmNvdW50LmJsb2dzLCBkZXNjKG4pKSAlPiUgdG9wX24oMTApLAogICAgICAgIHggPSB+d29yZCwKICAgICAgICB5ID0gfm4sCiAgICAgICAgbmFtZSA9ICJXb3JkIGNvdW50ICh0b3AgMTApIiwKICAgICAgICB0eXBlID0gImJhciIKKSAlPiUgCiAgICAgICAgbGF5b3V0KHRpdGxlID0gIlR3aXR0ZXIgd29yZCBjb3VudCAodG9wIDEwKSIsCiAgICAgICAgICAgICAgIHhheGlzID0gbGlzdCh0aXRsZSA9ICJ3b3JkIiksCiAgICAgICAgICAgICAgIHlheGlzID0gbGlzdCh0aXRsZSA9ICJmcmVxdWVuY3kiKSkKYGBgCgpUaGlzIGNoYXJ0IHNob3dzIHRoZSB0b3AgMTAgd29yZCBmb3VuZCBpbiB0aGUgbmV3cyB0eHQgZmlsZQoKYGBge3IgbmV3c1dvcmRDb3VudH0KcGxvdF9seSgKICAgICAgICBhcnJhbmdlKHdvcmQuY291bnQubmV3cywgZGVzYyhuKSkgJT4lIHRvcF9uKDEwKSwKICAgICAgICB4ID0gfndvcmQsCiAgICAgICAgeSA9IH5uLAogICAgICAgIG5hbWUgPSAiV29yZCBjb3VudCAodG9wIDEwKSIsCiAgICAgICAgdHlwZSA9ICJiYXIiCikgJT4lIAogICAgICAgIGxheW91dCh0aXRsZSA9ICJUd2l0dGVyIHdvcmQgY291bnQgKHRvcCAxMCkiLAogICAgICAgICAgICAgICB4YXhpcyA9IGxpc3QodGl0bGUgPSAid29y
ZCIpLAogICAgICAgICAgICAgICB5YXhpcyA9IGxpc3QodGl0bGUgPSAiZnJlcXVlbmN5IikpCmBgYAoKVGhpcyBjaGFydCBzaG93cyB0aGUgdG9wIDEwIHdvcmQgZm91bmQgaW4gdGhlIHR3aXR0ZXIgdHh0IGZpbGUKCmBgYHtyIHR3aXR0ZXJXb3JkQ291bnR9CnBsb3RfbHkoCiAgICAgICAgYXJyYW5nZSh3b3JkLmNvdW50LnR3aXR0ZXIsIGRlc2MobikpICU+JSB0b3BfbigxMCksCiAgICAgICAgeCA9IH53b3JkLAogICAgICAgIHkgPSB+biwKICAgICAgICBuYW1lID0gIldvcmQgY291bnQgKHRvcCAxMCkiLAogICAgICAgIHR5cGUgPSAiYmFyIgopICU+JSAKICAgICAgICBsYXlvdXQodGl0bGUgPSAiVHdpdHRlciB3b3JkIGNvdW50ICh0b3AgMTApIiwKICAgICAgICAgICAgICAgeGF4aXMgPSBsaXN0KHRpdGxlID0gIndvcmQiKSwKICAgICAgICAgICAgICAgeWF4aXMgPSBsaXN0KHRpdGxlID0gImZyZXF1ZW5jeSIpKQpgYGAK