Introduction

This document explains the major features of the data and briefly summarizes my plans for creating the prediction algorithm and Shiny app. The goals of this report are to:

1. Download the data and successfully load it in.
2. Create a basic report of summary statistics about the data sets.
3. Report interesting findings found so far.
4. Get feedback on my plans for creating a prediction algorithm and Shiny app.

Raw dataset statistical summary

Read data

setwd("/Users/lt/Desktop/data sciences/capstone/final/en_US")
txt <- dir()  # the three en_US corpus files: blogs, news, twitter
con <- file(txt[1], "r")
blogs <- readLines(con)
close(con)
con <- file(txt[2], "r")
news <- readLines(con)
close(con)
con <- file(txt[3], "r")
twitter <- readLines(con)
## Warning in readLines(con): line 167155 appears to contain an embedded nul
## Warning in readLines(con): line 268547 appears to contain an embedded nul
## Warning in readLines(con): line 1274086 appears to contain an embedded nul
## Warning in readLines(con): line 1759032 appears to contain an embedded nul
close(con)
com_data <- list(blogs, news, twitter)  # keep the three corpora together for the summary statistics

Descriptive summary of raw dataset

line_count <- sapply(com_data, length)
char_count <- sapply(com_data, function(x) sum(nchar(x)))
data.frame(file_name=txt,line_count,char_count)
##           file_name line_count char_count
## 1   en_US.blogs.txt     899288  206824505
## 2    en_US.news.txt    1010242  203223159
## 3 en_US.twitter.txt    2360148  162096031
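
A rough per-file word count could be added to the same summary; a minimal sketch, approximating words by splitting each line on whitespace:

# Approximate word counts by splitting each line on whitespace
word_count <- sapply(com_data, function(x) sum(lengths(strsplit(x, "\\s+"))))
data.frame(file_name=txt, line_count, char_count, word_count)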

Sampling dataset

Only a random subset (1% of each dataset) is used for the exploratory data analysis. The three samples are then combined into a corpus (corpus_sample) for further analysis.

require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(tm)
## Loading required package: tm
## Loading required package: NLP
set.seed(1234)  # for reproducible sampling
blog_sample <- sample(blogs, size=length(blogs)*0.01)
news_sample <- sample(news, size=length(news)*0.01)
twitter_sample <- sample(twitter, size=length(twitter)*0.01)
# Combine the three samples into a single corpus, one document per source
corpus_sample <- VCorpus(VectorSource(list(blog_sample, news_sample, twitter_sample)))

Histograms and word clouds of the sample dataset

Blog sample dataset

require(tm)
require(wordcloud)
## Loading required package: wordcloud
## Loading required package: RColorBrewer
blogs.dtm <- DocumentTermMatrix(VCorpus(VectorSource(blog_sample)))
blogs.dtms <- removeSparseTerms(blogs.dtm , 0.999)
blog_freq<-sort(colSums(as.matrix(blogs.dtms)), decreasing=TRUE)
hist(blog_freq, breaks = 1000)

wordcloud(names(blog_freq), blog_freq, min.freq=100, max.words=100)

News sample dataset

require(tm)
require(wordcloud)
news.dtm <- DocumentTermMatrix(VCorpus(VectorSource(news_sample)))
news.dtms <- removeSparseTerms(news.dtm , 0.999)
news_freq<-sort(colSums(as.matrix(news.dtms)), decreasing=TRUE)
hist(news_freq, breaks = 1000)

wordcloud(names(news_freq), news_freq, min.freq=100, max.words=100)

Twitter sample dataset

require(tm)
require(wordcloud)
twitter.dtm <- DocumentTermMatrix(VCorpus(VectorSource(twitter_sample)))
twitter.dtms <- removeSparseTerms(twitter.dtm , 0.999)
twitter_freq<-sort(colSums(as.matrix(twitter.dtms)), decreasing=TRUE)
hist(twitter_freq, breaks = 1000)

wordcloud(names(twitter_freq), twitter_freq, min.freq=100, max.words=100)

Preprocessing and cleaning sample dataset

Convert the text to lower case; remove punctuation, numbers, URLs, and extra whitespace; and stem the documents in the sample dataset. Stop-word and profanity removal are left commented out for now.

require(tm)
require(SnowballC)
## Loading required package: SnowballC
corpus_sample <- tm_map(corpus_sample, content_transformer(function(x) iconv(x, to="UTF-8",sub="byte")))
corpus_sample <- tm_map(corpus_sample, content_transformer(tolower))
corpus_sample <- tm_map(corpus_sample, content_transformer(removePunctuation))
corpus_sample <- tm_map(corpus_sample, content_transformer(removeNumbers))
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)  # custom transformation to strip URLs
corpus_sample <- tm_map(corpus_sample, content_transformer(removeURL))
corpus_sample <- tm_map(corpus_sample, stripWhitespace)
#corpus_sample <- tm_map(corpus_sample, removeWords, stopwords("english"))
#corpus_sample <- tm_map(corpus_sample, removeWords, profanityWords)
corpus_sample <- tm_map(corpus_sample, stemDocument)
corpus_sample <- tm_map(corpus_sample, stripWhitespace)
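
The commented-out profanity filter assumes a character vector profanityWords that is not defined above; a minimal sketch of how it could be supplied, assuming a hypothetical local word list bad_words.txt with one term per line:

# Hypothetical word list file; any public profanity list could be substituted
profanityWords <- readLines("bad_words.txt", warn = FALSE)
corpus_sample <- tm_map(corpus_sample, removeWords, profanityWords)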

Tokenization into unigrams, bigrams and trigrams

require(RWeka)
## Loading required package: RWeka
GetNGramDataFrame <- function(corpus, ngram, sparse) {
    # Build a term-document matrix, using RWeka's NGramTokenizer for n > 1
    if (ngram == 1) {
        tdm <- TermDocumentMatrix(corpus)
    } else {
        tdm <- TermDocumentMatrix(corpus, 
                    control=list(tokenize=function(x){NGramTokenizer(x, 
                                        Weka_control(min=ngram, max=ngram))}))
    }
    # Drop very sparse terms and return the n-gram counts sorted by frequency
    tdm <- removeSparseTerms(tdm, sparse)
    nGramFrequency <- sort(rowSums(as.matrix(tdm)), decreasing=TRUE)
    data.frame(word=names(nGramFrequency), count=nGramFrequency)
}

unigramDF <- GetNGramDataFrame(corpus_sample,1,0.99)
bigramDF <- GetNGramDataFrame(corpus_sample,2,0.99)
trigramDF <- GetNGramDataFrame(corpus_sample,3,0.99)

head(unigramDF,10)
##      word count
## the   the 47959
## and   and 24315
## that that 11296
## for   for 10823
## you   you  9522
## with with  7205
## was   was  6410
## have have  5848
## this this  5462
## but   but  4897
head(bigramDF,10)
##            word count
## of the   of the  4265
## in the   in the  4148
## to the   to the  2241
## for the for the  1992
## on the   on the  1921
## to be     to be  1614
## at the   at the  1497
## and the and the  1237
## in a       in a  1185
## go to     go to  1062
head(trigramDF,10)
##                        word count
## one of the       one of the   359
## a lot of           a lot of   313
## thank for the thank for the   249
## i want to         i want to   201
## to be a             to be a   196
## go to be           go to be   178
## it was a           it was a   151
## out of the       out of the   147
## as well as       as well as   145
## some of the     some of the   144
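
A quick follow-up on these frequencies is word coverage; a minimal sketch, computed over the terms retained in unigramDF after sparse-term removal:

# Cumulative share of word occurrences covered by the most frequent retained terms
coverage <- cumsum(unigramDF$count) / sum(unigramDF$count)
# How many distinct words are needed to cover 50% and 90% of those occurrences
c(words_50 = which(coverage >= 0.5)[1], words_90 = which(coverage >= 0.9)[1])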

Plots

require(ggplot2)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
p1 <- ggplot(data = head(unigramDF, n = 15), aes(reorder(word, -count), count)) +
          geom_bar(stat = "identity") + coord_flip() +
          xlab("Uni-Gram") + labs(title = "Uni-Gram Count")
p1

p2 <- ggplot(data = head(bigramDF, n = 15), aes(reorder(word, -count), count)) +
          geom_bar(stat = "identity") + coord_flip() +
          xlab("Bi-Gram") + labs(title = "Bi-Gram Count")
p2

p3 <- ggplot(data = head(trigramDF, n = 15), aes(reorder(word, -count), count)) +
          geom_bar(stat = "identity") + coord_flip() +
          xlab("Tri-Gram") + labs(title = "Tri-Gram Count")
p3

Conclusion

Interesting findings

Loading and cleaning the dataset takes a lot of time; the processing is slow because of the large file sizes. To avoid excessively long run times, it was necessary to work with a random sample of the data for text mining and tokenization.

In addition, removing words from the dataset is also very time-consuming, so it is better to save the cleaned dataset for later use. For the prediction application, more thought is needed about which words to remove and why; for example, stop words such as “the”, “it”, “this”, “were” and “was” might need to be excluded.
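
A minimal sketch of saving the cleaned objects for reuse (the file names are placeholders):

# Save the cleaned corpus and n-gram tables so the expensive cleaning step is not repeated
saveRDS(corpus_sample, "corpus_sample.rds")
saveRDS(list(unigram=unigramDF, bigram=bigramDF, trigram=trigramDF), "ngram_tables.rds")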

Next steps and goals

This exploratory analysis has helped us understand the distribution of words, tokens and phrases in the text and the relationships between them.

Due to memory constraints, only a random subset (1% of each data source) has been included in the analysis.

The next step will be to rebuild the n-gram models, extending them up to 4-grams, and to build a predictive model that, given a word or phrase as input, tries to predict the most likely next word.
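
As an illustration of the intended direction (not the final model), here is a minimal backoff-style lookup over the bigram and trigram tables built above; the helper predict_next_word and its matching rules are only a sketch:

# Sketch of a simple backoff lookup: try the trigram table first, then fall back to bigrams.
# The tables are already sorted by count, so the first match is the most frequent completion.
predict_next_word <- function(phrase, bigramDF, trigramDF) {
    words <- tolower(unlist(strsplit(phrase, "\\s+")))
    n <- length(words)
    if (n >= 2) {
        prefix <- paste(words[n - 1], words[n])
        hits <- trigramDF[grepl(paste0("^", prefix, " "), trigramDF$word), ]
        if (nrow(hits) > 0) return(sub(".* ", "", as.character(hits$word[1])))
    }
    hits <- bigramDF[grepl(paste0("^", words[n], " "), bigramDF$word), ]
    if (nrow(hits) > 0) return(sub(".* ", "", as.character(hits$word[1])))
    NA_character_
}
# e.g. predict_next_word("one of", bigramDF, trigramDF) should return "the" given the counts above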

The final model will be deployed as a Shiny application, so that the end user can easily interact with it.