Coursera Capstone Milestone Report

This document describes the initial exploratory analysis undertaken for the English versions of the Twitter, News, and Blogs data sets. It walks through loading and tokenizing the data, examines summary statistics for all three files, and presents frequency tables, histograms, and estimates of effective dictionary sizes. One thing that becomes apparent from the histograms and frequency tables is that they are dominated by stopwords (words like “the”, “and”, and “to” that add little semantic value). A conscious decision was made to keep these stopwords, since word prediction without them tends to become cryptic and the results can be difficult to understand. A dual approach may be warranted, pairing a model that includes stopwords with one in which they are removed (a sketch of such a variant appears after the transform_corpus() definition below). In any case, eliminating them completely seems like a bad idea.

Readers more interested in the frequency tables and histograms are encouraged to skip past the developmental code to the end of the report.

Reading in the Data

The raw data is read in from the three text files and then re-encoded to ISO-8859-1, which simplifies later steps such as tokenizing and frequency analysis.

timestamp()
## ##------ Wed Jan 22 16:17:45 2020 ------##
message("Reading in twitter data file...")
## Reading in twitter data file...
lines_twitter <- suppressWarnings(readLines("en_US.twitter.txt"))
lines_twitter <- iconv(lines_twitter, from = "Windows-1254", to = "ISO-8859-1")
message(sprintf("  %d lines read in.", length(lines_twitter)))
##   2360148 lines read in.
timestamp()
## ##------ Wed Jan 22 16:17:54 2020 ------##
message("Reading in news data file...")
## Reading in news data file...
lines_news <- suppressWarnings(readLines("en_US.news.txt"))
lines_news <- iconv(lines_news, from = "Windows-1254", to = "ISO-8859-1")
message(sprintf("  %d lines read in.", length(lines_news)))
##   1010242 lines read in.
timestamp()
## ##------ Wed Jan 22 16:18:00 2020 ------##
message("Reading in blogs data file...")
## Reading in blogs data file...
lines_blogs <- suppressWarnings(readLines("en_US.blogs.txt"))
lines_blogs <- iconv(lines_blogs, from = "Windows-1254", to = "ISO-8859-1")
message(sprintf("  %d lines read in.", length(lines_blogs)))
##   899288 lines read in.
message("  Done.")
##   Done.
timestamp()
## ##------ Wed Jan 22 16:18:07 2020 ------##
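As a quick sanity check on the encoding step (a hedged aside, not part of the original run), one could count how many lines still contain non-ASCII bytes after conversion, and how many lines iconv() failed to convert:

# Hedged sketch, not run here: lines still containing non-ASCII bytes,
# and lines iconv() turned into NA because they could not be converted.
sum(grepl("[^\\x01-\\x7F]", lines_twitter, perl = TRUE, useBytes = TRUE), na.rm = TRUE)
sum(is.na(lines_twitter))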

Defining Helper Functions

This section defines a few helper functions for parsing the data: tokenizers that build n-grams, a corpus transformation that strips whitespace, punctuation, and numbers (and also lower-cases and stems the text), and functions that tabulate n-gram frequency tables.

# Loading the packages used from here on: tm for the corpus and tokenizing
# machinery (its NLP dependency provides ngrams() and words()), and dplyr
# for the table manipulation below.
library(tm)
library(dplyr)

# Creating tokenizer functions
ngramTokenizer   <- function(x, n) unlist(lapply(ngrams(words(x), n), paste, collapse = " "), use.names = FALSE)
BigramTokenizer  <- function(x)    ngramTokenizer(x, 2)
TrigramTokenizer <- function(x)    ngramTokenizer(x, 3)
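
# A quick, hypothetical sanity check of the tokenizers on a toy document
# (not capstone data); words() expects a TextDocument, so the toy string
# is wrapped in a PlainTextDocument first.
toy_doc <- PlainTextDocument("one two three four")
BigramTokenizer(toy_doc)   # "one two"  "two three"  "three four"
TrigramTokenizer(toy_doc)  # "one two three"  "two three four"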

# The transform_corpus() function takes a corpus as input and creates a
# standardized output that can be used for tokenizing: whitespace is
# collapsed, text is lower-cased, punctuation and numbers are removed,
# and words are stemmed.
transform_corpus <- function(x) {
    out <- tm_map(x, content_transformer(stripWhitespace)) %>%
            tm_map(content_transformer(tolower)) %>%
            tm_map(content_transformer(removePunctuation)) %>%
            tm_map(content_transformer(stemDocument), language = "english") %>%
            tm_map(content_transformer(removeNumbers))
    return(out)
}
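
# A sketch of the stopword-free variant mentioned in the introduction (an
# assumption about how it might look; it is not used in the analysis below).
# removeWords() and stopwords() come from tm; note that removing stopwords
# before stemming would match the stopword list more reliably.
transform_corpus_nostop <- function(x) {
    transform_corpus(x) %>%
        tm_map(content_transformer(removeWords), stopwords("english"))
}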

# The following functions make frequency tables out of an input document matrix

make_1gram <- function(x) {
    # one row per distinct term, in the term-index order of the matrix
    terms <- x$dimnames$Terms
    terms <- data.frame(terms, stringsAsFactors = FALSE)
    colnames(terms) <- c("lastTerm")
    # table(x$i) counts the non-zero entries in each term row, i.e. the
    # number of lines in which each term appears
    freqs <- data.frame(table(x$i))
    terms$IDX  <- freqs$Var1
    terms$frequency <- freqs$Freq
    terms <- arrange(terms, desc(frequency))
    return(terms)
}

make_2gram <- function(x, trimlow = 0) {
    # split each bigram "w1 w2" into its two component words
    terms <- x$dimnames$Terms
    terms <- strsplit(terms, " ", fixed = TRUE)
    terms <- data.frame(matrix(unlist(terms), nrow = length(unlist(terms))/2, byrow = TRUE), stringsAsFactors = FALSE)
    colnames(terms) <- c("firstTerms","lastTerm")
    # number of lines in which each bigram appears
    freqs <- data.frame(table(x$i))
    terms$IDX  <- freqs$Var1
    terms$frequency <- freqs$Freq
    terms <- arrange(terms, desc(frequency))
    # optionally drop bigrams at or below the trimlow frequency threshold
    terms <- filter(terms, frequency > trimlow)
    return(terms)
}

make_3gram <- function(x, trimlow = 0) {
    # split each trigram "w1 w2 w3" into its three component words
    terms <- x$dimnames$Terms
    terms <- strsplit(terms, " ", fixed = TRUE)
    terms <- data.frame(matrix(unlist(terms), nrow = length(unlist(terms))/3, byrow = TRUE), stringsAsFactors = FALSE)
    # keep the combined key (w1_w2), the second word alone, and the final word
    terms <- mutate(terms, firstTerms = paste(X1, X2, sep = "_"), firstTerms2 = X2, lastTerm = X3)
    terms <- select(terms, firstTerms, firstTerms2, lastTerm)
    # number of lines in which each trigram appears
    freqs <- data.frame(table(x$i))
    terms$IDX  <- freqs$Var1
    terms$frequency <- freqs$Freq
    terms <- arrange(terms, desc(frequency))
    # optionally drop trigrams at or below the trimlow frequency threshold
    terms <- filter(terms, frequency > trimlow)
    return(terms)
}
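
These tables are shaped with next-word lookup in mind: the first words of an n-gram form the key and lastTerm is the candidate completion. As a hedged illustration (predict_next() is hypothetical and not part of the analysis below), a trigram table could be queried like this:

predict_next <- function(trigram_table, w1, w2) {
    # look up the two preceding (stemmed) words and return the most
    # frequent completion, or NA if the pair was never observed
    key  <- paste(w1, w2, sep = "_")
    hits <- filter(trigram_table, firstTerms == key)
    if (nrow(hits) == 0) return(NA_character_)
    hits$lastTerm[which.max(hits$frequency)]
}
# e.g. predict_next(freq_twitter_3, "thank", "for") should return "the",
# the top completion in the Twitter trigram table shown below.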

Parsing the Data

The data read in above is free-form text. The next step builds corpora from it and separates the text into single words (1-grams), bigrams, and trigrams.

timestamp()
## ##------ Wed Jan 22 16:18:07 2020 ------##
vs_twitter <- VectorSource(lines_twitter)
vs_news    <- VectorSource(lines_news)
vs_blogs   <- VectorSource(lines_blogs)

ct_twitter_0 <- VCorpus(vs_twitter, readerControl = list(reader = readPlain, language = "en", load = TRUE))
ct_news_0    <- VCorpus(vs_news, readerControl = list(reader = readPlain, language = "en", load = TRUE))
ct_blogs_0   <- VCorpus(vs_blogs, readerControl = list(reader = readPlain, language = "en", load = TRUE))

ct_twitter   <- transform_corpus(ct_twitter_0)
ct_news      <- transform_corpus(ct_news_0)
ct_blogs     <- transform_corpus(ct_blogs_0)

timestamp()
## ##------ Wed Jan 22 16:35:29 2020 ------##
# parsing 1-grams (single words)
tdm_twitter_1 <- TermDocumentMatrix(ct_twitter)
tdm_news_1    <- TermDocumentMatrix(ct_news)
tdm_blogs_1   <- TermDocumentMatrix(ct_blogs)

timestamp()
## ##------ Wed Jan 22 16:48:28 2020 ------##
# parsing bigrams
tdm_twitter_2 <- TermDocumentMatrix(ct_twitter, control = list(tokenize = BigramTokenizer))
tdm_news_2    <- TermDocumentMatrix(ct_news, control = list(tokenize = BigramTokenizer))
tdm_blogs_2   <- TermDocumentMatrix(ct_blogs, control = list(tokenize = BigramTokenizer))

timestamp()
## ##------ Wed Jan 22 17:13:57 2020 ------##
# parsing trigrams
tdm_twitter_3 <- TermDocumentMatrix(ct_twitter, control = list(tokenize = TrigramTokenizer))
tdm_news_3    <- TermDocumentMatrix(ct_news, control = list(tokenize = TrigramTokenizer))
tdm_blogs_3   <- TermDocumentMatrix(ct_blogs, control = list(tokenize = TrigramTokenizer))

timestamp()
## ##------ Wed Jan 22 17:48:45 2020 ------##
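
Before building the frequency tables, it can be worth spot-checking the matrix dimensions (a hedged aside, not run in the original): rows are distinct terms and columns are lines of text, so the row counts preview the dictionary sizes tabulated below.

# Hedged spot-check: number of distinct terms by number of lines.
dim(tdm_twitter_1)
dim(tdm_twitter_2)
dim(tdm_twitter_3)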

Creating the Frequency Tables

Next, the parsed data can be summarized as single words (1-grams), bigrams, or trigrams. This process also gives insight into how big the dictionary needs to be: in the simplest example, a word, bigram, or trigram with a frequency of one is unlikely to be useful as a predictor, so it can be discarded immediately.

timestamp()
## ##------ Wed Jan 22 17:48:45 2020 ------##
# creating 1-gram frequency tables

freq_twitter_1 <- make_1gram(tdm_twitter_1)
freq_news_1    <- make_1gram(tdm_news_1)
freq_blogs_1   <- make_1gram(tdm_blogs_1)

# showing the 20 most frequent 1-grams for each file
table_1 <- cbind(freq_twitter_1[1:20,], freq_news_1[1:20,], freq_blogs_1[1:20,])
colnames(table_1) <- c("Twitter", "Index", "Freq", "News", "Index", "Freq", "Blogs", "Index", "Freq")
print(table_1[,c(1,3,4,6,7,9)])
##    Twitter   Freq  News Freq.1 Blogs Freq.2
## 1      the 704520   the 649843   the 354758
## 2      you 425145   and 458362   and 300686
## 3      and 376379   for 238726  that 161585
## 4      for 342123  that 237492   for 155095
## 5     that 245465  said 191214  with 127926
## 6     your 181841  with 179898  this 118706
## 7     have 168942   was 149771   you 107246
## 8     with 159768   but 118810  have 105713
## 9     this 151511  from 114477   was  98345
## 10     are 144110  have 108981   but  94516
## 11    just 141328   are 100175   are  84785
## 12     get 134899   his  98995   not  75977
## 13   thank 124490   has  90377  from  72795
## 14    what 119276  this  90015   all  72019
## 15    like 118671   not  84033   one  63946
## 16     but 117721   who  83117   out  56418
## 17     not 112785  year  82312 about  54154
## 18     all 111382  will  78388  they  53733
## 19    love 109909  they  75669  what  53396
## 20     out 108227 about  68621  like  53299
message(sprintf("For 1-grams, note that a total of all frequencies is essentially a word count: %d",
                sum(freq_twitter_1$frequency)+sum(freq_news_1$frequency)+sum(freq_blogs_1$frequency)))
## For 1-grams, note that a total of all frequencies is essentially a word count: 54227667
message(sprintf("Total size before pruning the elements with a frequency of one: %d",
                dim(freq_twitter_1)[1]+dim(freq_news_1)[1]+dim(freq_blogs_1)[1]))
## Total size before pruning the elements with a frequency of one: 848144
# pruning
freq_twitter_1 <- freq_twitter_1[freq_twitter_1$frequency > 1,]
freq_news_1    <- freq_news_1[freq_news_1$frequency > 1,]
freq_blogs_1   <- freq_blogs_1[freq_blogs_1$frequency > 1,]

message(sprintf("Total size after pruning the elements with a frequency of one : %d",
                dim(freq_twitter_1)[1]+dim(freq_news_1)[1]+dim(freq_blogs_1)[1]))
## Total size after pruning the elements with a frequency of one : 311683
# creating bigram frequency tables

freq_twitter_2 <- make_2gram(tdm_twitter_2)
freq_news_2    <- make_2gram(tdm_news_2)
freq_blogs_2   <- make_2gram(tdm_blogs_2)

# showing the 20 most frequent bigrams for each file
table_2 <- cbind(freq_twitter_2[1:20,], freq_news_2[1:20,], freq_blogs_2[1:20,])
colnames(table_2) <- c("Twt-1", "Twt-2", "Index", "Freq", "News-1", "News-2", "Index", "Freq",
                       "Blogs-1", "Blogs-2", "Index", "Freq")
print(table_2[,c(1,2,4,5,6,8,9,10,12)])
##    Twt-1 Twt-2  Freq News-1 News-2 Freq.1 Blogs-1 Blogs-2 Freq.2
## 1     in   the 73615     of    the 136756      of     the  81085
## 2    for   the 71221     in    the 133242      in     the  72485
## 3     of   the 53205     to    the  66733      to     the  43087
## 4     on   the 46149     on    the  57870      on     the  39536
## 5     go    to 45321    for    the  55678      to      be  34083
## 6     to    be 44348     at    the  46995     for     the  31490
## 7  thank   for 43193    and    the  42148     and     the  30571
## 8     to   the 41330     in      a  41598     and       i  28511
## 9   have     a 37209     to     be  36909       i    have  26883
## 10    at   the 35598   with    the  35386      at     the  25230
## 11     i  love 34006   from    the  30499      it     was  24895
## 12 thank   you 31923   with      a  28290      in       a  24573
## 13  want    to 31422     of      a  27011      is       a  24317
## 14    if   you 30786     he   said  26880       i     was  24256
## 15     i  have 29700     as      a  25372    with     the  23884
## 16   for     a 28333    for      a  25100      it      is  23716
## 17     i    am 27309   that    the  24759       i      am  23412
## 18     i  dont 27175     is      a  23254    from     the  20847
## 19    to   see 27105     by    the  23175    that       i  20364
## 20    to   get 26238   will     be  22047    want      to  20317
message(sprintf("Total size before pruning the elements with a frequency of one: %d",
                dim(freq_twitter_2)[1]+dim(freq_news_2)[1]+dim(freq_blogs_2)[1]))
## Total size before pruning the elements with a frequency of one: 12390006
# pruning
freq_twitter_2 <- freq_twitter_2[freq_twitter_2$frequency > 1,]
freq_news_2    <- freq_news_2[freq_news_2$frequency > 1,]
freq_blogs_2   <- freq_blogs_2[freq_blogs_2$frequency > 1,]

message(sprintf("Total size after pruning the elements with a frequency of one : %d",
                dim(freq_twitter_2)[1]+dim(freq_news_2)[1]+dim(freq_blogs_2)[1]))
## Total size after pruning the elements with a frequency of one : 3789035
# creating trigram frequency tables

freq_twitter_3 <- make_3gram(tdm_twitter_3)
freq_news_3    <- make_3gram(tdm_news_3)
freq_blogs_3   <- make_3gram(tdm_blogs_3)

# showing the 20 most frequent trigrams for each file
table_3 <- cbind(freq_twitter_3[1:20,], freq_news_3[1:20,], freq_blogs_3[1:20,])
colnames(table_3) <- c("Twt-1/2", "Twt-2", "Twt-3", "Index", "Freq", "News-1/2", "News-2", "News-3", "Index", "Freq",
                       "Blogs-1/2", "Blogs-2", "Blogs-3", "Index", "Freq")
print(table_3[,c(1,3,5,6,8,10,11,13,15)])
##         Twt-1/2  Twt-3  Freq  News-1/2 News-3 Freq.1 Blogs-1/2 Blogs-3 Freq.2
## 1     thank_for    the 23460    one_of    the  11901    one_of     the   7739
## 2  look_forward     to 11048     a_lot     of   8573     a_lot      of   6608
## 3     thank_you    for  8410   part_of    the   5615    i_want      to   5936
## 4     cant_wait     to  7967   as_well     as   5282    it_was       a   4178
## 5       for_the follow  7955   the_end     of   4768    be_abl      to   4068
## 6        i_want     to  7688 accord_to    the   4758   some_of     the   3972
## 7        i_love    you  7421    out_of    the   4706   a_coupl      of   3845
## 8         go_to     be  7139   some_of    the   4606   as_well      as   3800
## 9        have_a  great  6022     to_be      a   4442     to_be       a   3713
## 10        a_lot     of  5998    in_the  first   4416    out_of     the   3705
## 11        to_be      a  5827  the_unit  state   3840   the_end      of   3698
## 12       i_need     to  5792     go_to     be   3708    i_have    been   3437
## 13       to_see    you  5744    be_abl     to   3703    i_have       a   3321
## 14       i_have      a  5490 the_first   time   3496   this_is       a   3181
## 15        im_go     to  5414    it_was      a   3467    i_have      to   3100
## 16       one_of    the  5340 member_of    the   3352  the_rest      of   3081
## 17       i_have     to  4764    end_of    the   3133   part_of     the   3042
## 18        is_go     to  4497   said_in      a   2962    one_of      my   3032
## 19       i_dont   know  4438   for_the  first   2911     i_had      to   2916
## 20       let_me   know  4314    of_the   year   2891  there_is       a   2787
message(sprintf("Total size before pruning the elements with a frequency of one: %d",
                dim(freq_twitter_3)[1]+dim(freq_news_3)[1]+dim(freq_blogs_3)[1]))
## Total size before pruning the elements with a frequency of one: 37952155
# pruning
freq_twitter_3 <- freq_twitter_3[freq_twitter_3$frequency > 1,]
freq_news_3    <- freq_news_3[freq_news_3$frequency > 1,]
freq_blogs_3   <- freq_blogs_3[freq_blogs_3$frequency > 1,]

message(sprintf("Total size after pruning the elements with a frequency of one : %d",
                dim(freq_twitter_3)[1]+dim(freq_news_3)[1]+dim(freq_blogs_3)[1]))
## Total size after pruning the elements with a frequency of one : 6249408

It was especially important with the trigrams to prune the single-instance entries from the data set: the trigram tables shrink from 37,952,155 entries to 6,249,408, so the final size is roughly 16% of the initial size.
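
For the effective-dictionary-size estimates mentioned in the introduction, one natural follow-up (a hedged sketch, not run here; dict_size_for() is hypothetical) is to find the smallest number of top-frequency 1-grams that covers a target share of all recorded occurrences:

dict_size_for <- function(freq_table, target = 0.9) {
    # cumulative share of all recorded occurrences, taken in descending
    # frequency order (the tables built above are already sorted that way)
    cum_share <- cumsum(freq_table$frequency) / sum(freq_table$frequency)
    which(cum_share >= target)[1]
}
# e.g. dict_size_for(freq_twitter_1, 0.9) gives the number of distinct
# (stemmed) terms needed to cover 90% of the recorded Twitter occurrences.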

Histograms

The following histograms show graphically how often each of the top-frequency words, bigrams, and trigrams appears in the files. The top 25 of each are shown here, and it is clear that the top few entries occur with significantly greater frequency than the rest.
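
To see how sharply frequency falls off beyond the chart cut-off, a rank-frequency plot on log-log axes could complement the bar charts (a sketch, not among the original figures):

# Hedged sketch: rank-frequency curve for the Twitter 1-grams on log-log
# axes; the steep initial drop mirrors the skew visible in the bar charts.
plot(seq_len(nrow(freq_twitter_1)), freq_twitter_1$frequency, log = "xy",
     type = "l", xlab = "Rank", ylab = "Frequency",
     main = "Twitter 1-gram Rank-Frequency")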

# Twitter 1-grams
freq_twitter_1a <- freq_twitter_1[1:25,]
freq_twitter_1a <- arrange(freq_twitter_1a, frequency)
par(mai=c(1,2,1,1))
barplot(freq_twitter_1a$frequency, names.arg = freq_twitter_1a$lastTerm, horiz = TRUE, las=1,
        main = "Twitter 1-grams", xlab = "Frequency")

# Twitter bigrams
freq_twitter_2a <- freq_twitter_2[1:25,]
freq_twitter_2a <- arrange(freq_twitter_2a, frequency)
freq_twitter_2a$label <- paste(freq_twitter_2a$firstTerms, freq_twitter_2a$lastTerm,sep = "_")
par(mai=c(1,2,1,1))
barplot(freq_twitter_2a$frequency, names.arg = freq_twitter_2a$label, horiz = TRUE, las=1,
        main = "Twitter Bigrams", xlab = "Frequency")

# Twitter trigrams
freq_twitter_3a <- freq_twitter_3[1:25,]
freq_twitter_3a <- arrange(freq_twitter_3a, frequency)
freq_twitter_3a$label <- paste(freq_twitter_3a$firstTerms, freq_twitter_3a$lastTerm,sep = "_")
par(mai=c(1,2,1,1))
barplot(freq_twitter_3a$frequency, names.arg = freq_twitter_3a$label, horiz = TRUE, las=1,
        main = "Twitter Trigrams", xlab = "Frequency")

# News 1-grams
freq_news_1a <- freq_news_1[1:25,]
freq_news_1a <- arrange(freq_news_1a, frequency)
par(mai=c(1,2,1,1))
barplot(freq_news_1a$frequency, names.arg = freq_news_1a$lastTerm, horiz = TRUE, las=1,
        main = "News 1-grams", xlab = "Frequency")

# News bigrams
freq_news_2a <- freq_news_2[1:25,]
freq_news_2a <- arrange(freq_news_2a, frequency)
freq_news_2a$label <- paste(freq_news_2a$firstTerms, freq_news_2a$lastTerm,sep = "_")
par(mai=c(1,2,1,1))
barplot(freq_news_2a$frequency, names.arg = freq_news_2a$label, horiz = TRUE, las=1,
        main = "News Bigrams", xlab = "Frequency")

# News trigrams
freq_news_3a <- freq_news_3[1:25,]
freq_news_3a <- arrange(freq_news_3a, frequency)
freq_news_3a$label <- paste(freq_news_3a$firstTerms, freq_news_3a$lastTerm,sep = "_")
par(mai=c(1,2,1,1))
barplot(freq_news_3a$frequency, names.arg = freq_news_3a$label, horiz = TRUE, las=1,
        main = "News Trigrams", xlab = "Frequency")

# Blogs 1-grams
freq_blogs_1a <- freq_blogs_1[1:25,]
freq_blogs_1a <- arrange(freq_blogs_1a, frequency)
par(mai=c(1,2,1,1))
barplot(freq_blogs_1a$frequency, names.arg = freq_blogs_1a$lastTerm, horiz = TRUE, las=1,
        main = "Blogs 1-grams", xlab = "Frequency")

# Blogs bigrams
freq_blogs_2a <- freq_blogs_2[1:25,]
freq_blogs_2a <- arrange(freq_blogs_2a, frequency)
freq_blogs_2a$label <- paste(freq_blogs_2a$firstTerms, freq_blogs_2a$lastTerm,sep = "_")
par(mai=c(1,2,1,1))
barplot(freq_blogs_2a$frequency, names.arg = freq_blogs_2a$label, horiz = TRUE, las=1,
        main = "Blogs Bigrams", xlab = "Frequency")

# Blogs trigrams
freq_blogs_3a <- freq_blogs_3[1:25,]
freq_blogs_3a <- arrange(freq_blogs_3a, frequency)
freq_blogs_3a$label <- paste(freq_blogs_3a$firstTerms, freq_blogs_3a$lastTerm,sep = "_")
par(mai=c(1,2,1,1))
barplot(freq_blogs_3a$frequency, names.arg = freq_blogs_3a$label, horiz = TRUE, las=1,
        main = "Blogs Trigrams", xlab = "Frequency")