This is the Milestone Report for the Data Science Capstone. We are mining a large collection of text to develop a predictive text algorithm. But before we get there, like any data scientist, we should start by understanding the problem, getting and cleaning the data, and doing some exploratory data analysis.
The data we are using can be found at: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
We’ll be using the following packages:
library(tm)
library(RWeka)
library(knitr)
library(ggplot2)
library(stringi)
library(caTools)
Now let’s load in the data we previously downloaded.
con <- file("en_US.blogs.txt", open = "r")
enblog <- readLines(con)
close(con)
con <- file("en_US.news.txt", open = "r")
ennews <- readLines(con)
close(con)
con <- file("en_US.twitter.txt", open = "r")
entweet <- readLines(con)
close(con)
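One note before moving on: on some systems readLines() will warn about embedded nulls or an incomplete final line when reading these files (the news file in this dataset is often reported to be the culprit). If that happens, a hedged alternative is to reopen the file in binary mode and skip nulls, for example:
# Optional re-read if readLines() warns about embedded nulls:
# open in binary mode, assume UTF-8 encoding, and skip nulls
con <- file("en_US.news.txt", open = "rb")
ennews <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)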
Excellent! Now we can begin looking at some basic summaries of the three files we just read in. First, let’s look at the size of each dataset (in MB):
as.numeric(object.size(enblog) / 1024 ^2)
as.numeric(object.size(ennews) / 1024 ^2)
as.numeric(object.size(entweet) / 1024 ^2)
## Blog File Size: 248.493500
## News File Size: 249.632881
## Tweets File Size: 301.396698
How many lines are in each file?
length(enblog)
length(ennews)
length(entweet)
## Number of lines in blogs: 899288
## Number of lines in news: 1010242
## Number of lines in tweets: 2360148
How many words are in each dataset?
sum(stri_count_words(enblog))
sum(stri_count_words(ennews))
sum(stri_count_words(entweet))
## Number of words in blogs: 37546246
## Number of words in news: 34762395
## Number of words in tweets: 30093369
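Since knitr is already loaded, we can also collect these numbers into a single summary table. This is just a convenience view of the values computed above (sizes in MB):
# Combine the size, line, and word counts from above into one table
file_summary <- data.frame(
  File = c("Blogs", "News", "Tweets"),
  Size_MB = c(as.numeric(object.size(enblog) / 1024^2),
              as.numeric(object.size(ennews) / 1024^2),
              as.numeric(object.size(entweet) / 1024^2)),
  Lines = c(length(enblog), length(ennews), length(entweet)),
  Words = c(sum(stri_count_words(enblog)),
            sum(stri_count_words(ennews)),
            sum(stri_count_words(entweet))))
kable(file_summary)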
Now that we’ve taken an initial look at the data, we know we’re dealing with a massive amount of text. Let’s take a sample so we’re working with a representative portion. This will also let us train a model on the sample and test it against the rest later. I decided on a 0.25% sample just to get a first look at the data.
# sample data
set.seed(4851)
sample_blog <- sample.split(enblog, SplitRatio= .0025, group = NULL)
sub_blog <- enblog[sample_blog]
sample_news <- sample.split(ennews, SplitRatio =.0025, group = NULL)
sub_news <- ennews[sample_news]
sample_tweet <- sample.split(entweet, SplitRatio = .0025, group = NULL)
sub_tweet <- entweet[sample_tweet]
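As a quick sanity check, each sample should come out to roughly 0.25% of its source:
# Each ratio should be close to 0.0025
length(sub_blog) / length(enblog)
length(sub_news) / length(ennews)
length(sub_tweet) / length(entweet)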
Now we will combine these separate samples into one corpus.
options(mc.cores=1)
sample_txt <- c(sub_blog, sub_news, sub_tweet)
train_corpus <- VCorpus(VectorSource(sample_txt))
Now we want to clean the data by stripping extra whitespace, removing numbers and punctuation, and converting all characters to lower case.
train_corpus <- tm_map(train_corpus, stripWhitespace)
train_corpus <- tm_map(train_corpus, removeNumbers)
train_corpus <- tm_map(train_corpus, removePunctuation)
train_corpus <- tm_map(train_corpus, content_transformer(tolower))
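To confirm the transformations took, we can spot-check one of the cleaned documents:
# The text should now be lower case with no numbers or punctuation
as.character(train_corpus[[1]])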
Now that that’s done, let’s take a more in-depth look at the word counts in these samples.
blog_word <- stri_count_words(as.character(sub_blog))
news_word <- stri_count_words(as.character(sub_news))
tweet_word <- stri_count_words(as.character(sub_tweet))
The charts:
ggplot(NULL, aes(x = blog_word, y = ..density..)) +
geom_histogram(fill = 'blue', color = 'black', binwidth = 25) +
xlab('# of words') +
ylab('%') +
ggtitle('Word count/blog post') +
theme(plot.title = element_text(size = rel(1.5)))
ggplot(NULL, aes(x = news_word, y = ..density..)) +
geom_histogram(fill = 'green', color = 'black', binwidth = 25) +
xlab('# of words') +
ylab('%') +
ggtitle('Word count/news article') +
theme(plot.title = element_text(size = rel(1.5)))
ggplot(NULL, aes(x = tweet_word, y = ..density..)) +
geom_histogram(fill = 'yellow', color = 'black', binwidth = 1) +
xlab('# of words') +
ylab('%') +
ggtitle('Word count/tweet') +
theme(plot.title = element_text(size = rel(1.5)))
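The histograms show the shape of the distributions; a quick numeric summary of words per document rounds out the picture:
# Five-number summaries (plus mean) of words per document in each sample
summary(blog_word)
summary(news_word)
summary(tweet_word)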
Now that we have a decent sample all cleaned, let’s investigate the frequencies of the unigrams, bigrams, and trigrams it contains. First we have to create a few tokenizer functions to do that work.
uniGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
biGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
triGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tdm_uniGram <- TermDocumentMatrix(train_corpus, control = list(tokenize = uniGramTokenizer))
tdm_biGram <- TermDocumentMatrix(train_corpus, control = list(tokenize = biGramTokenizer))
tdm_triGram <- TermDocumentMatrix(train_corpus, control = list(tokenize = triGramTokenizer))
freq_uniGram <- findFreqTerms(tdm_uniGram, 750)
frequency_uniGram <- rowSums(as.matrix(tdm_uniGram[freq_uniGram,]))
frequency_uniGram <- data.frame(unigram = names(frequency_uniGram), frequency = frequency_uniGram)
frequency_uniGram <- frequency_uniGram[with(frequency_uniGram, order(frequency, decreasing = TRUE)),]
frequency_uniGram$prob <- frequency_uniGram$frequency/sum(frequency_uniGram$frequency)
ggplot(frequency_uniGram,
aes(x = reorder(unigram, frequency), y = frequency)) +
geom_bar(stat = "identity", fill="purple", color = "black") +
xlab("uniGram") + ylab("Frequency") +
labs(title = "Top Unigrams") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
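We can also print the top of the unigram table to see the raw counts and relative frequencies behind the chart:
# Most frequent unigrams with their counts and relative frequencies
head(frequency_uniGram, 10)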
freq_biGram <- findFreqTerms(tdm_biGram, 150)
frequency_biGram <- rowSums(as.matrix(tdm_biGram[freq_biGram,]))
frequency_biGram <- data.frame(bigram = names(frequency_biGram), frequency = frequency_biGram)
frequency_biGram <- frequency_biGram[with(frequency_biGram, order(frequency, decreasing = TRUE)),]
frequency_biGram$prob <- frequency_biGram$frequency/sum(frequency_biGram$frequency)
ggplot(frequency_biGram,
aes(x = reorder(bigram, frequency), y = frequency)) +
geom_bar(stat = "identity", fill="orange", color = "black") +
xlab("biGram") + ylab("Frequency") +
labs(title = "Top Bigrams") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
freq_triGram <- findFreqTerms(tdm_triGram, 23)
frequency_triGram <- rowSums(as.matrix(tdm_triGram[freq_triGram,]))
frequency_triGram <- data.frame(trigram = names(frequency_triGram), frequency = frequency_triGram)
frequency_triGram <- frequency_triGram[with(frequency_triGram, order(frequency, decreasing = TRUE)),]
frequency_triGram$prob <- frequency_triGram$frequency/sum(frequency_triGram$frequency)
ggplot(frequency_triGram,
aes(x = reorder(trigram, frequency), y = frequency)) +
geom_bar(stat = "identity", fill="red", color = "black") +
xlab("triGram") + ylab("Frequency") +
labs(title = "Top Trigrams") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
There’s a lot left to do even now:

* Use a larger sample, which will require more time to take in all this data.
* Develop and tune the model to start predicting words based on the 1 or 2 words preceding them (a minimal sketch of the idea follows below).
* Investigate 4-grams to see if they add anything interesting.
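As a very rough illustration of where the model is headed, here is a minimal sketch of a bigram lookup built on the frequency_biGram table above. It simply returns the most frequent bigrams that start with a given word; the function name and approach are placeholders, not the final model.
# Minimal sketch (not the final model): given a single word, return the
# most frequent bigrams that begin with it. Only bigrams that passed the
# frequency cutoff above are available, so coverage is limited.
predict_next <- function(word, n = 3) {
  pattern <- paste0("^", word, " ")
  hits <- frequency_biGram[grepl(pattern, frequency_biGram$bigram), ]
  head(hits[order(hits$frequency, decreasing = TRUE), c("bigram", "frequency")], n)
}
predict_next("at")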
Much to do, very excited to continue on!