Synopsis

This is the milestone report for week 2 of the data science capstone project.

The purpose of this report is to develop an understanding of the properties of the various data sets that can be used to build a prediction model for the final Shiny application. The report describes the major features of the training data and then summarizes future plans.

The three sources of text data are blogs, news articles, and Twitter posts. These sources will be used to train the model.
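Before any processing, a rough sense of the size of each source can be useful. The sketch below (optional, not part of the model pipeline) summarizes line counts, rough word counts, and file size in MB, assuming the same local file paths used later in this report; note that reading the full files can be slow and memory-intensive.

# Optional size summary of each source (assumed paths; reads the full files, so memory-heavy)
sourceFiles <- c(
  blogs   = "~/Desktop/courserajhu/capstonemilestonereport/data/en_US/en_US.blogs.txt",
  news    = "~/Desktop/courserajhu/capstonemilestonereport/data/en_US/en_US.news.txt",
  twitter = "~/Desktop/courserajhu/capstonemilestonereport/data/en_US/en_US.twitter.txt"
)
t(sapply(sourceFiles, function(f) {
  lines <- readLines(f, skipNul = TRUE)
  c(lines = length(lines),
    words = sum(lengths(strsplit(lines, "\\s+"))),
    mb    = round(file.size(f) / 1024^2, 1))
}))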

Setting up the environment

Install and load the required packages.

install.packages("knitr", repos = "http://cran.us.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/b_/9cdzl10j1gz_c97v4rk3dmh00000gn/T//RtmpUq4HJO/downloaded_packages
install.packages("tm", repos = "http://cran.us.r-project.org")
## 
##   There is a binary version available but the source version is later:
##    binary source needs_compilation
## tm  0.7-8 0.7-10              TRUE
## installing the source package 'tm'
install.packages("RWeka", repos = "http://cran.us.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/b_/9cdzl10j1gz_c97v4rk3dmh00000gn/T//RtmpUq4HJO/downloaded_packages
install.packages("ggplot2", repos = "http://cran.us.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/b_/9cdzl10j1gz_c97v4rk3dmh00000gn/T//RtmpUq4HJO/downloaded_packages
library(knitr)
library(tm)
## Loading required package: NLP
library(RWeka)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate

Storing the data

# As the full data set is too large to load, I will instead read the first 20000 lines of each file.
# This applies only to the English (en_US) data.

blogs <- readLines("~/Desktop/courserajhu/capstonemilestonereport/data/en_US/en_US.blogs.txt", 20000)
news <- readLines("~/Desktop/courserajhu/capstonemilestonereport/data/en_US/en_US.news.txt", 20000)
twitter <- readLines("~/Desktop/courserajhu/capstonemilestonereport/data/en_US/en_US.twitter.txt", 20000)
# Paste corresponding lines from the three sources together and sample 10000
# of the combined strings (with replacement)
sampledDocs <- sample(paste(blogs, news, twitter), size = 10000, replace = TRUE)
rm(blogs, news, twitter)

documents <- Corpus(VectorSource(sampledDocs))
rm(sampledDocs)

Cleaning the data

# convert all text to lowercase
documents <- tm_map(documents, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(documents, content_transformer(tolower)):
## transformation drops documents
#remove the whitespace
documents <- tm_map(documents, stripWhitespace)
## Warning in tm_map.SimpleCorpus(documents, stripWhitespace): transformation drops
## documents
#remove all punctuation
documents <- tm_map(documents, removePunctuation)
## Warning in tm_map.SimpleCorpus(documents, removePunctuation): transformation
## drops documents
#remove numbers
documents <- tm_map(documents, removeNumbers)
## Warning in tm_map.SimpleCorpus(documents, removeNumbers): transformation drops
## documents

Another factor to consider is the profanity present in these data sets. Removing it may or may not improve the predictive capability of the model.
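If a profanity filter turns out to help, one possible approach (a sketch only; profanity.txt is a hypothetical banned-words list, not part of this report's data) is to drop the offending terms with removeWords:

# Hypothetical profanity filter: profanityWords is assumed to be a character
# vector loaded from an external banned-words list
profanityWords <- readLines("profanity.txt")
documents <- tm_map(documents, removeWords, profanityWords)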

Tokenizing the sample into unigrams, bigrams, and trigrams

uniGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
biGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
triGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

uniGramMatrix <- TermDocumentMatrix(documents, control = list(tokenize = uniGramTokenizer))
## Warning in TermDocumentMatrix.SimpleCorpus(documents, control = list(tokenize =
## uniGramTokenizer)): custom functions are ignored
## Warning in TermDocumentMatrix.SimpleCorpus(documents, control = list(tokenize =
## uniGramTokenizer)): custom tokenizer is ignored
biGramMatrix <- TermDocumentMatrix(documents, control = list(tokenize = biGramTokenizer))
## Warning in TermDocumentMatrix.SimpleCorpus(documents, control = list(tokenize =
## biGramTokenizer)): custom functions are ignored
## Warning in TermDocumentMatrix.SimpleCorpus(documents, control = list(tokenize =
## biGramTokenizer)): custom tokenizer is ignored
triGramMatrix <- TermDocumentMatrix(documents, control = list(tokenize = triGramTokenizer))
## Warning in TermDocumentMatrix.SimpleCorpus(documents, control = list(tokenize =
## triGramTokenizer)): custom functions are ignored
## Warning in TermDocumentMatrix.SimpleCorpus(documents, control = list(tokenize =
## triGramTokenizer)): custom tokenizer is ignored
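
The "custom tokenizer is ignored" warnings above arise because Corpus(VectorSource(...)) returns a SimpleCorpus in current versions of tm, and TermDocumentMatrix ignores custom tokenizers for that class, so the bigram and trigram matrices are in effect built with the default word tokenizer. One possible fix (a sketch, not re-run here) is to build a VCorpus instead, for which the RWeka tokenizers are honoured:

# Rebuild the corpus as a VCorpus so the custom n-gram tokenizers take effect
vdocs <- VCorpus(VectorSource(content(documents)))
biGramMatrix <- TermDocumentMatrix(vdocs, control = list(tokenize = biGramTokenizer))
triGramMatrix <- TermDocumentMatrix(vdocs, control = list(tokenize = triGramTokenizer))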

Graphing frequently occurring n-grams

freqTerms <- findFreqTerms(uniGramMatrix, lowfreq = 2000)
termFrequency <- rowSums(as.matrix(uniGramMatrix[freqTerms,]))
termFrequency <- data.frame(unigram=names(termFrequency), frequency=termFrequency)

g <- ggplot(termFrequency, aes(x=reorder(unigram, frequency), y=frequency)) +
    geom_bar(stat = "identity") +  coord_flip() +
    theme(legend.title=element_blank()) +
    xlab("Unigram") + ylab("Frequency") +
    labs(title = "Top Unigrams by Frequency")
print(g)

freqTerms <- findFreqTerms(biGramMatrix, lowfreq = 2000)
termFrequency <- rowSums(as.matrix(biGramMatrix[freqTerms,]))
termFrequency <- data.frame(bigram=names(termFrequency), frequency=termFrequency)

g <- ggplot(termFrequency, aes(x=reorder(bigram, frequency), y=frequency)) +
    geom_bar(stat = "identity") +  coord_flip() +
    theme(legend.title=element_blank()) +
    xlab("Bigram") + ylab("Frequency") +
    labs(title = "Top Bigrams by Frequency")
print(g)

freqTerms <- findFreqTerms(triGramMatrix, lowfreq = 2000)
termFrequency <- rowSums(as.matrix(triGramMatrix[freqTerms,]))
termFrequency <- data.frame(trigram=names(termFrequency), frequency=termFrequency)

g <- ggplot(termFrequency, aes(x=reorder(trigram, frequency), y=frequency)) +
    geom_bar(stat = "identity") +  coord_flip() +
    theme(legend.title=element_blank()) +
    xlab("Trigram") + ylab("Frequency") +
    labs(title = "Top Trigrams by Frequency")
print(g)
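
As a side note, as.matrix() densifies the sparse term-document matrix before summing, which can become expensive for larger samples. A possible alternative (a sketch, using slam, which is installed as a dependency of tm) is to sum rows directly on the sparse representation:

# Sketch: term frequencies without converting the sparse matrix to a dense one
termFrequency <- slam::row_sums(triGramMatrix[freqTerms, ])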