Exploratory analysis

The goal of this report is to describe the major features of the data and to outline a plan for creating a prediction algorithm.

Objectives

Download the data and successfully load it in (a minimal loading sketch follows this list).

Create a basic report of summary statistics about the data sets.

Report any interesting findings amassed so far.

Get feedback on your plans for creating a prediction algorithm and Shiny app.
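
For reference, here is a minimal sketch of the setup the rest of this report assumes: the libraries used below plus a plain readLines() pass over the three en_US source files. The file names, encoding, and skipNul settings are assumptions, since the loading code is not shown in the report itself.

library(stringi)  # stri_count_words()
library(tm)       # Corpus(), tm_map()
library(RWeka)    # NGramTokenizer(), Weka_control()
library(ggplot2)  # frequency plots

# Read each source file line by line; skipNul guards against embedded nuls
blogs   <- readLines("en_US.blogs.txt",   encoding = "UTF-8", skipNul = TRUE)
news    <- readLines("en_US.news.txt",    encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)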

Basic Summary: Line Counts

length(blogs)
## [1] 899288
length(news)
## [1] 77259
length(twitter)
## [1] 2360148

Word counts

blogs.words <- stri_count_words(blogs)
news.words <- stri_count_words(news)
twitter.words <- stri_count_words(twitter)
c(sum(blogs.words), sum(news.words), sum(twitter.words))
## [1] 37546239  2674536 30093413

Descriptive statistics for each document

summary(nchar(blogs))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1      47     156     230     329   40833
summary(nchar(news))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     2.0   111.0   186.0   202.4   270.0  5760.0
summary(nchar(twitter))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00   37.00   64.00   68.68  100.00  140.00

Analysis

# Sample each source to keep the analysis tractable

set.seed(10000)
sampleblogs <- sample(blogs, size = length(blogs) * 0.05)
samplenews <- sample(news, size = length(news) * 0.05)
sampletwitter <- sample(twitter, size = length(twitter) * 0.05)
# Pool the samples with c() (paste() would glue unrelated lines from the
# three sources together), then draw 10,000 lines for the corpus
data.sample <- sample(c(sampleblogs, samplenews, sampletwitter), size = 10000)

# Generate the corpus and clean the data
corpus <- Corpus(VectorSource(data.sample))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

corpus <- tm_map(corpus, toSpace, "[^[:graph:]]")       # drop non-printable characters
corpus <- tm_map(corpus, content_transformer(tolower))  # lower-case the text
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, PlainTextDocument)

Each tm_map call on a SimpleCorpus emits a "transformation drops documents" warning; these warnings are expected here and can be ignored.

Tokenize Data and Plot N-gram Frequency

#Tokenize Data
uniGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
biGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
triGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

uniGramMatrix <- TermDocumentMatrix(corpus, control = list(tokenize = uniGramTokenizer))
biGramMatrix <- TermDocumentMatrix(corpus, control = list(tokenize = biGramTokenizer))
triGramMatrix <- TermDocumentMatrix(corpus, control = list(tokenize = triGramTokenizer))

#Plot Unigram Frequency
freqTerm <- findFreqTerms(uniGramMatrix, lowfreq = 5000)
termFreq <- rowSums(as.matrix(uniGramMatrix[freqTerm,]))
termFreq <- data.frame(unigram = names(termFreq), frequency = termFreq)

p <- ggplot(termFreq, aes(x = reorder(unigram, frequency), y = frequency)) + 
  geom_bar(stat = "identity") + xlab("unigram") + ylab("frequency") + 
  labs(title = "UniGram Frequency")
print(p)
#Plot BiGram Frequency
freqTerm <- findFreqTerms(biGramMatrix, lowfreq = 1000)
termFreq <- rowSums(as.matrix(biGramMatrix[freqTerm,]))
termFreq <- data.frame(bigram = names(termFreq), frequency = termFreq)

p <- ggplot(termFreq, aes(x = reorder(bigram, frequency), y = frequency)) + 
  geom_bar(stat = "identity") + xlab("bigram") + ylab("frequency") + 
  labs(title = "BiGram Frequency")
print(p)
#Plot TriGram Frequency
freqTerm <- findFreqTerms(triGramMatrix, lowfreq = 130)
termFreq <- rowSums(as.matrix(triGramMatrix[freqTerm,]))
termFreq <- data.frame(trigram = names(termFreq), frequency = termFreq)

p <- ggplot(termFreq, aes(x = reorder(trigram, frequency), y = frequency)) + 
  geom_bar(stat = "identity") + xlab("trigram") + ylab("frequency") + theme(axis.text.x = element_text(angle = 90, hjust = 1))+
  labs(title = "TriGram Frequency")
print(p)

Summary

The next step is to finalize a predictive model based on the N-gram analysis above and to build a Shiny app on top of that model.
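
As a sketch of that direction, a simple stupid-backoff-style lookup could be built on top of the bigram and trigram frequency tables computed above. Everything here is illustrative, not the final model: predictNextWord, the table names bigram.freq and trigram.freq, and their assumed layout (an ngram column of space-separated words plus a frequency column) are all assumptions.

# Illustrative backoff predictor: try the trigram table first, then bigrams.
# bigram.freq / trigram.freq are assumed data frames with columns
# `ngram` (space-separated words) and `frequency`, built as in the plots above.
predictNextWord <- function(text, bigram.freq, trigram.freq, n = 3) {
  words <- tail(unlist(strsplit(tolower(text), "\\s+")), 2)

  # Try the trigram table first, keyed on the last two words
  if (length(words) == 2) {
    prefix <- paste0(paste(words, collapse = " "), " ")
    hits <- trigram.freq[startsWith(as.character(trigram.freq$ngram), prefix), ]
    if (nrow(hits) > 0) {
      hits <- hits[order(-hits$frequency), ]
      return(sapply(strsplit(as.character(head(hits$ngram, n)), " "), tail, 1))
    }
  }

  # Back off to the bigram table, keyed on the last word only
  prefix <- paste0(tail(words, 1), " ")
  hits <- bigram.freq[startsWith(as.character(bigram.freq$ngram), prefix), ]
  if (nrow(hits) == 0) return(character(0))
  hits <- hits[order(-hits$frequency), ]
  sapply(strsplit(as.character(head(hits$ngram, n)), " "), tail, 1)
}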

The Shiny app will let the user input text and return a few predicted words.
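
A minimal Shiny sketch of that interaction might look like the following, reusing the hypothetical predictNextWord() above; the layout and input names are assumptions.

library(shiny)

ui <- fluidPage(
  titlePanel("Next Word Prediction"),
  textInput("phrase", "Enter a phrase:"),
  verbatimTextOutput("prediction")
)

server <- function(input, output) {
  output$prediction <- renderText({
    req(input$phrase)  # wait until the user has typed something
    paste(predictNextWord(input$phrase, bigram.freq, trigram.freq),
          collapse = ", ")
  })
}

shinyApp(ui = ui, server = server)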