Training Datasets

The first step is to download the three files, place them in the working directory, and read them in. The following R code reads each source file, tokenizes every line, and counts the total number of words.

library(dplyr)
library(triebeard)
library(ggplot2)
library(ngram)
#Get total number of lines and total number of words in the file "en_US.news.txt"
con <- file("en_US.news.txt", "r")
lineAll <- readLines(con)
close(con)
tokenAll <- tokenizers::tokenize_words(lineAll, lowercase=TRUE)
totalLines <- length(lineAll)
wordCount <- sum(lengths(tokenAll))
newsLines <- totalLines
newsWords <- wordCount
newsTokens <- tokenAll
#Get total number of lines and total number of words in the file "en_US.blogs.txt"
con <- file("en_US.blogs.txt", "r")
lineAll <- readLines(con)
close(con)
tokenAll <- tokenizers::tokenize_words(lineAll, lowercase=TRUE)
totalLines <- length(lineAll)
wordCount <- sum(lengths(tokenAll))
blogsLines <- totalLines
blogsWords <- wordCount
blogsTokens <- tokenAll
#Get total number of lines and total number of words in the file "en_US.twitter.txt"
con <- file("en_US.twitter.txt", "r")
lineAll <- readLines(con)
close(con)
tokenAll <- tokenizers::tokenize_words(lineAll, lowercase=TRUE)
totalLines <- length(lineAll)
wordCount <- sum(lengths(tokenAll))
twitterLines <- totalLines
twitterWords <- wordCount
twitterTokens <- tokenAll
summaryMatrix <- matrix( c(newsLines, newsWords, blogsLines, blogsWords, twitterLines, twitterWords),
                           nrow=3, ncol=2, byrow = TRUE)             
dimnames(summaryMatrix) = list( 
     c("en_US.news", "en_US.blogs", "en_US.Twitter"),         # row names 
     c("Number of Lines", "Number of Words"))                 # column names 
summaryMatrix
##               Number of Lines Number of Words
## en_US.news            1010242        34762395
## en_US.blogs            899288        37546246
## en_US.Twitter         2360148        30093369
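
The same read/tokenize/count steps are repeated for each of the three files, so they could also be wrapped in a small helper function. The sketch below is only illustrative (the name summarize_text_file is ours, not part of the code above); it returns the line count, word count, and token list for one file.

# Illustrative helper: read one file, tokenize it, and return its
# line count, word count, and token list (summarize_text_file is our name).
summarize_text_file <- function(path) {
  lines  <- readLines(path, warn = FALSE)
  tokens <- tokenizers::tokenize_words(lines, lowercase = TRUE)
  list(lines = length(lines), words = sum(lengths(tokens)), tokens = tokens)
}
# Example usage:
# news <- summarize_text_file("en_US.news.txt")
# c(news$lines, news$words)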

Plotting the Exploratory Analyses

At this point, we created the following histograms using only the first 50 lines of each file; otherwise it is difficult to see the differences between the three files.

# Count word frequencies in the first 50 lines of tokens of the en_US.news.txt file.
# Each key is the word followed by "_", so one word's key cannot be a prefix of
# another's; longest_match() therefore acts as an exact lookup and returns NA
# for words not yet in the trie.
key <- concatenate(newsTokens[[1]][1], "_", collapse="")
trieNews <- trie(keys=key, values=1)
for (h in 1:50) {
  for (i in seq_along(newsTokens[[h]])) {
    if (h == 1 && i == 1) next  # the first word already seeded the trie
    key <- concatenate(newsTokens[[h]][i], "_", collapse="")
    val <- longest_match(trieNews, key)
    if (is.na(val)) {
      trie_add(trieNews, keys=key, values=1)
    } else {
      trie_add(trieNews, keys=key, values=val + 1)
    }
  }
}
# Count word frequencies in the first 50 lines of tokens of the en_US.blogs.txt file.
key <- concatenate(blogsTokens[[1]][1], "_", collapse="")
trieBlogs <- trie(keys=key, values=1)
for (h in 1:50) {
  for (i in seq_along(blogsTokens[[h]])) {
    if (h == 1 && i == 1) next  # the first word already seeded the trie
    key <- concatenate(blogsTokens[[h]][i], "_", collapse="")
    val <- longest_match(trieBlogs, key)
    if (is.na(val)) {
      trie_add(trieBlogs, keys=key, values=1)
    } else {
      trie_add(trieBlogs, keys=key, values=val + 1)
    }
  }
}
# Count word frequencies in the first 50 lines of tokens of the en_US.twitter.txt file.
key <- concatenate(twitterTokens[[1]][1], "_", collapse="")
trieTwitter <- trie(keys=key, values=1)
for (h in 1:50) {
  for (i in seq_along(twitterTokens[[h]])) {
    if (h == 1 && i == 1) next  # the first word already seeded the trie
    key <- concatenate(twitterTokens[[h]][i], "_", collapse="")
    val <- longest_match(trieTwitter, key)
    if (is.na(val)) {
      trie_add(trieTwitter, keys=key, values=1)
    } else {
      trie_add(trieTwitter, keys=key, values=val + 1)
    }
  }
}
dfNews <- as.data.frame(trieNews)
dfBlogs <- as.data.frame(trieBlogs)
dfTwitter <- as.data.frame(trieTwitter)
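
As a quick cross-check of the trie counts, the same frequencies can be computed with base R. The sketch below (our addition, not part of the original analysis) tabulates the first 50 lines of the news tokens; the names differ from the trie keys (which carry a trailing "_"), but the counts should agree with dfNews.

# Cross-check sketch: word frequencies of the first 50 news lines via table().
newsFreqCheck <- table(unlist(newsTokens[1:50]))
head(sort(newsFreqCheck, decreasing = TRUE))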

qplot(dfNews$values, geom="histogram", binwidth=0.5, fill=I("blue"), col=I("red"), 
      xlab="Number of times a word showed up", main="en_US.news.txt")

qplot(dfBlogs$values, geom="histogram", binwidth=0.5, fill=I("blue"), col=I("red"),
      xlab="Number of times a word showed up", main="en_US.blogs.txt")

qplot(dfTwitter$values, geom="histogram", binwidth=0.5, fill=I("blue"), col=I("red"),
      xlab="Number of times a word showed up", main="en_US.twitter.txt")

What the histograms show

It appears that the news and blogs files contain more words that are used repeatedly, while in the Twitter file words are repeated less often than in the other two files. This may be related to the fact that people use fewer stopwords when tweeting.
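
One way to probe this would be to compare the share of stopwords in the first 50 lines of each file. The sketch below uses a small, illustrative stopword list of our own choosing rather than a complete one.

# Sketch: proportion of stopwords among the first 50 lines of each file.
# The stopword list is a small illustrative sample, not an exhaustive one.
stopwordsSample <- c("the", "a", "an", "and", "or", "but", "of", "to", "in",
                     "on", "for", "is", "are", "was", "were", "i", "you",
                     "he", "she", "it", "we", "they", "that", "this")
stopwordShare <- function(tokens) {
  words <- unlist(tokens[1:50])
  mean(words %in% stopwordsSample)
}
c(news = stopwordShare(newsTokens),
  blogs = stopwordShare(blogsTokens),
  twitter = stopwordShare(twitterTokens))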

Plans for Creating a Prediction Algorithm

First of all, we would like to modify the trie data structure we are currently using in order to streamline the processing of millions of tokenized words. We also plan to keep working on how we handle stopwords so that the predictions become more accurate. Finally, we would like to include a back-off model in our application.
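
As an illustration of the back-off idea, the sketch below scores a candidate next word with a simple "stupid back-off" scheme over pre-computed bigram and unigram counts. The count vectors, the key format, and the 0.4 back-off weight are illustrative assumptions, not part of the current code.

# Illustrative "stupid back-off" score: use the bigram estimate when the
# bigram has been seen, otherwise back off to a discounted unigram estimate.
# bigramCounts and unigramCounts are assumed to be named numeric vectors,
# e.g. bigramCounts["of_the"] and unigramCounts["of"].
backoffScore <- function(prev, word, bigramCounts, unigramCounts, lambda = 0.4) {
  bigram <- paste(prev, word, sep = "_")
  if (!is.na(bigramCounts[bigram]) && !is.na(unigramCounts[prev])) {
    return(unname(bigramCounts[bigram] / unigramCounts[prev]))
  }
  if (!is.na(unigramCounts[word])) {
    return(unname(lambda * unigramCounts[word] / sum(unigramCounts)))
  }
  0
}
# Example with hypothetical counts:
# unigramCounts <- c(the = 120, of = 80, cat = 5)
# bigramCounts  <- c(of_the = 40)
# backoffScore("of", "the", bigramCounts, unigramCounts)  # 40 / 80 = 0.5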