Exploratory analysis
The goal of this report is to describe the major features of the data and to outline the plan for creating a prediction algorithm.
Objective
Demonstrate that the data has been downloaded and successfully loaded.
Create a basic report of summary statistics about the data sets.
Report any interesting findings that you amassed so far.
Get feedback on your plans for creating a prediction algorithm and Shiny app.
Basic Summary Line Count
# Number of elements in each character vector; each element is presumably one
# line of the corresponding source file (the loading code is not shown here).
# NOTE(review): news is far smaller than the other two sources -- confirm the
# news file was read completely (e.g. through a binary-mode connection).
length(blogs)
## [1] 899288
length(news)
## [1] 77259
length(twitter)
## [1] 2360148
Word counts
# stri_count_words() returns one word count per element, so each *.words
# vector holds the number of words in every line of that source.
blogs.words <- stri_count_words(blogs)
news.words <- stri_count_words(news)
twitter.words <- stri_count_words(twitter)
# Total words per source, printed in the order blogs, news, twitter.
c(sum(blogs.words), sum(news.words), sum(twitter.words))
## [1] 37546239 2674536 30093413
Descriptive statistics of line length (in characters) for each data set
# Distribution (min, quartiles, mean, max) of the number of characters per
# element, computed separately for each source.
summary(nchar(blogs))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 47 156 230 329 40833
summary(nchar(news))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.0 111.0 186.0 202.4 270.0 5760.0
summary(nchar(twitter))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 37.00 64.00 68.68 100.00 140.00
Analysis
# Data will be sampled
# Draw a reproducible 5% random sample of the lines of each source.
set.seed(10000)
sample_fraction <- 0.05
# floor() makes the truncation of the fractional size explicit.
sampleblogs <- sample(blogs, size = floor(length(blogs) * sample_fraction))
samplenews <- sample(news, size = floor(length(news) * sample_fraction))
sampletwitter <- sample(
  twitter,
  size = floor(length(twitter) * sample_fraction)
)

# Build the working sample from 10000 lines of each source, drawn with
# replacement, and keep every line as its own document by combining with c().
# paste() must not be used here: it joins the three vectors element-wise
# (silently recycling the shorter ones), which glues a blog line, a news line
# and a tweet into a single string and produces n-grams that span unrelated
# texts. Drawing the same number of lines per source keeps the amount and the
# balance of text unchanged, so the frequency cut-offs used for the plots stay
# meaningful.
lines_per_source <- 10000
data.sample <- c(
  sample(sampleblogs, size = lines_per_source, replace = TRUE),
  sample(samplenews, size = lines_per_source, replace = TRUE),
  sample(sampletwitter, size = lines_per_source, replace = TRUE)
)
# Generate corpus and clean the data
# Build a VCorpus instead of calling Corpus(): Corpus(VectorSource(...))
# returns a SimpleCorpus, and tm documents that a SimpleCorpus always uses its
# built-in tokenizer and accepts no custom functions in the control list of
# TermDocumentMatrix(). The n-gram tokenizers used on this corpus therefore
# need a VCorpus to take effect.
corpus <- VCorpus(VectorSource(data.sample))

# Transformation that replaces every match of `pattern` with a single space.
toSpace <- content_transformer(function(x, pattern) {
  gsub(pattern, " ", x)
})

# Turn every non-graphical character (control characters, whitespace) into a
# plain space.
corpus <- tm_map(corpus, toSpace, "[^[:graph:]]")
# tolower() is a base function, not a tm transformation; wrapping it in
# content_transformer() changes only the text and keeps each element a
# text document.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
# No PlainTextDocument() repair step is needed: every step above is a proper
# tm transformation, so the documents never lose their class.
Tokenize Data and Plot N-gram Frequency
# Tokenize Data
# Factory returning a tokenizer that splits text into n-grams of exactly
# `n` words using RWeka's NGramTokenizer.
ngram_tokenizer <- function(n) {
  force(n)
  function(x) NGramTokenizer(x, Weka_control(min = n, max = n))
}

uniGramTokenizer <- ngram_tokenizer(1)
biGramTokenizer <- ngram_tokenizer(2)
triGramTokenizer <- ngram_tokenizer(3)

# One term-document matrix per n-gram order, each built from the cleaned
# corpus with the matching tokenizer.
uniGramMatrix <- TermDocumentMatrix(
  corpus,
  control = list(tokenize = uniGramTokenizer)
)
biGramMatrix <- TermDocumentMatrix(
  corpus,
  control = list(tokenize = biGramTokenizer)
)
triGramMatrix <- TermDocumentMatrix(
  corpus,
  control = list(tokenize = triGramTokenizer)
)
# Plot Unigram Frequency
# Keep only the unigrams that occur at least 5000 times in the sample.
freqTerm <- findFreqTerms(uniGramMatrix, lowfreq = 5000)
# Densify just the selected rows and total each term over all documents.
termCounts <- rowSums(as.matrix(uniGramMatrix[freqTerm, ]))
termFreq <- data.frame(unigram = names(termCounts), frequency = termCounts)
# Bar chart with the terms ordered by increasing frequency.
p <- ggplot(
  termFreq,
  aes(x = reorder(unigram, frequency), y = frequency)
) +
  geom_col() +
  labs(x = "unigram", y = "frequency", title = "UniGram Frequency")
print(p)
# Plot BiGram Frequency
# Keep only the bigrams that occur at least 1000 times in the sample.
freqTerm <- findFreqTerms(biGramMatrix, lowfreq = 1000)
# Densify just the selected rows and total each term over all documents.
termCounts <- rowSums(as.matrix(biGramMatrix[freqTerm, ]))
termFreq <- data.frame(bigram = names(termCounts), frequency = termCounts)
# Bar chart with the terms ordered by increasing frequency.
p <- ggplot(
  termFreq,
  aes(x = reorder(bigram, frequency), y = frequency)
) +
  geom_col() +
  labs(x = "bigram", y = "frequency", title = "BiGram Frequency")
print(p)
# Plot TriGram Frequency
# Keep only the trigrams that occur at least 130 times in the sample.
freqTerm <- findFreqTerms(triGramMatrix, lowfreq = 130)
# Densify just the selected rows and total each term over all documents.
termCounts <- rowSums(as.matrix(triGramMatrix[freqTerm, ]))
termFreq <- data.frame(trigram = names(termCounts), frequency = termCounts)
# Bar chart ordered by increasing frequency; the x labels are rotated so the
# three-word terms stay legible.
p <- ggplot(
  termFreq,
  aes(x = reorder(trigram, frequency), y = frequency)
) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x = "trigram", y = "frequency", title = "TriGram Frequency")
print(p)
Summary
The next steps in the process are to finalize a predictive model based on the n-gram analysis above and to create a Shiny app based on that model.
The Shiny app will allow the user to input text and will return a few predicted words.