The following libraries are used in this report:
library(tm) # Text Mining library
## Warning: package 'tm' was built under R version 3.2.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.2.3
library(stringr) # Supports basic file summaries
## Warning: package 'stringr' was built under R version 3.2.4
library(RWeka) # Supports n-gram tokenization
## Warning: package 'RWeka' was built under R version 3.2.4
library(stringi)
## Warning: package 'stringi' was built under R version 3.2.3
library(slam)
## Warning: package 'slam' was built under R version 3.2.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.3
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(R.utils)
## Warning: package 'R.utils' was built under R version 3.2.4
## Loading required package: R.oo
## Warning: package 'R.oo' was built under R version 3.2.3
## Loading required package: R.methodsS3
## Warning: package 'R.methodsS3' was built under R version 3.2.3
## R.methodsS3 v1.7.1 (2016-02-15) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.20.0 (2016-02-17) successfully loaded. See ?R.oo for help.
##
## Attaching package: 'R.oo'
## The following objects are masked from 'package:methods':
##
## getClasses, getMethods
## The following objects are masked from 'package:base':
##
## attach, detach, gc, load, save
## R.utils v2.2.0 (2015-12-09) successfully loaded. See ?R.utils for help.
##
## Attaching package: 'R.utils'
## The following object is masked from 'package:utils':
##
## timestamp
## The following objects are masked from 'package:base':
##
## cat, commandArgs, getOption, inherits, isOpen, parse, warnings
Getting the data and reading it into variables
blogsCon <- file("C:/Users/sue/Documents/R/capstone/final/en_US/en_US.blogs.txt", "r")
blogs <- readLines(blogsCon, encoding="UTF-8", skipNul = TRUE)
close(blogsCon)
twitterCon <- file("C:/Users/sue/Documents/R/capstone/final/en_US/en_US.twitter.txt", "r")
twitter <- readLines(twitterCon, encoding="UTF-8", skipNul = TRUE)
close(twitterCon)
newsCon <- file("C:/Users/sue/Documents/R/capstone/final/en_US//en_US.news.txt", "r")
news <- readLines(newsCon, encoding="UTF-8", skipNul = TRUE)
## Warning in readLines(newsCon, encoding = "UTF-8", skipNul = TRUE):
## incomplete final line found on 'C:/Users/sue/Documents/R/capstone/final/
## en_US//en_US.news.txt'
close(newsCon)
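The warning about an incomplete final line is harmless here. A commonly used workaround for this news file on Windows is to open it in binary mode; a minimal sketch (not what produced the results in this report):
newsCon <- file("C:/Users/sue/Documents/R/capstone/final/en_US/en_US.news.txt", open = "rb")
news <- readLines(newsCon, encoding = "UTF-8", skipNul = TRUE)
close(newsCon)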
Summary details for blogs.txt
File size
file.info("C:/Users/sue/Documents/R/capstone/final/en_US/en_US.blogs.txt")$size / (1024*1024)
## [1] 200.4242
Number of lines
countLines("C:/Users/sue/Documents/R/capstone/final/en_US/en_US.blogs.txt")
## [1] 899288
## attr(,"lastLineHasNewline")
## [1] TRUE
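The same kind of summary can be produced for all three sources at once; a small sketch using stringi (the sourceSummary name is new here, and the resulting counts are not shown):
## line and word counts per source (word counts via stringi)
sourceSummary <- data.frame(
  source = c("blogs", "news", "twitter"),
  lines = c(length(blogs), length(news), length(twitter)),
  words = c(sum(stri_count_words(blogs)),
            sum(stri_count_words(news)),
            sum(stri_count_words(twitter))))
sourceSummary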
Cleaning the data
Sampling about 1% of each data source, as the full files are very large.
# set seed for reproducible results
set.seed(1243)
blogSample <- sample(blogs, size=round(length(blogs)*0.01))
twitSample <- sample(twitter, size=round(length(twitter)*0.01))
newsSample <- sample(news, size=round(length(news)*0.01))
# Release the original character vectors and connections
rm(list = c("blogs", "news", "twitter", "blogsCon", "newsCon", "twitterCon"))
Using the tm package, the samples are loaded into Corpus objects so that unwanted characters can be cleaned out.
blogCorpus <- Corpus(VectorSource(blogSample), readerControl=list(reader=readPlain, language="en_US", load=TRUE))
twitCorpus <- Corpus(VectorSource(twitSample), readerControl=list(reader = readPlain, language="en_US", load = TRUE))
newsCorpus <- Corpus(VectorSource(newsSample), readerControl=list(reader=readPlain, language="en_US", load=TRUE))
#Release unused objects
rm(list = c("blogSample", "twitSample", "newsSample"))
We will apply the following cleansing steps:
- Convert non-UTF-8 characters to byte notation (this resolves issues with special characters in the Twitter source)
- Remove punctuation
- Transform all characters to lowercase
- Strip whitespace
- Remove numbers
## Cleanse Data Function
cleanData <- function(x){
  # drop most non-letter characters up front (keeps whitespace and apostrophes)
  x <- tm_map(x, content_transformer(function(x) stri_replace_all_regex(x, "[^\\p{L}\\s[']]+","")))
  # convert any remaining non-UTF-8 characters to byte notation
  x <- tm_map(x, content_transformer(function(x) iconv(enc2utf8(x), sub = "byte")))
  # remove punctuation
  x <- tm_map(x, content_transformer(removePunctuation))
  # transform all characters to lowercase
  x <- tm_map(x, content_transformer(tolower))
  # strip extra whitespace
  x <- tm_map(x, stripWhitespace)
  # remove numbers
  x <- tm_map(x, removeNumbers)
  x
}
## Cleansing Data
blogCorpus <- cleanData(blogCorpus)
twitCorpus <- cleanData(twitCorpus)
newsCorpus <- cleanData(newsCorpus)
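To sanity-check the cleansing, a couple of the cleaned documents can be inspected directly (output omitted; what appears depends on the random sample):
inspect(blogCorpus[1:2])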
Exploratory Analysis
After cleaning the data, an exploratory analysis is performed, leveraging the RWeka library to create 1-gram, 2-gram and 3-gram tokenizers.
## Define Weka_Control parameters
uniGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min=1, max=1, delimiters= " \\r\\n\\t.,;:\"()?!"))
binGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min=2, max=2, delimiters= " \\r\\n\\t.,;:\"()?!"))
triGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min=3, max=3, delimiters= " \\r\\n\\t.,;:\"()?!"))
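As a quick illustration (not part of the original output), the bigram tokenizer splits a sentence into overlapping word pairs:
binGramTokenizer("this is a simple test")
## should yield bigrams such as "this is", "is a", "a simple", "simple test"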
## Tokenization of Blogs Data
blogUniGramMatrix <- TermDocumentMatrix(blogCorpus, control = list(tokenize = uniGramTokenizer))
blogBinGramMatrix <- TermDocumentMatrix(blogCorpus, control = list(tokenize = binGramTokenizer))
blogTriGramMatrix <- TermDocumentMatrix(blogCorpus, control = list(tokenize = triGramTokenizer))
## Tokenization of Twitter Data
twitUniGramMatrix <- TermDocumentMatrix(twitCorpus, control = list(tokenize = uniGramTokenizer))
twitBinGramMatrix <- TermDocumentMatrix(twitCorpus, control = list(tokenize = binGramTokenizer))
twitTriGramMatrix <- TermDocumentMatrix(twitCorpus, control = list(tokenize = triGramTokenizer))
## Tokenization of News Data
newsUniGramMatrix <- TermDocumentMatrix(newsCorpus, control = list(tokenize = uniGramTokenizer))
newsBinGramMatrix <- TermDocumentMatrix(newsCorpus, control = list(tokenize = binGramTokenizer))
newsTriGramMatrix <- TermDocumentMatrix(newsCorpus, control = list(tokenize = triGramTokenizer))
Exploring the most frequent unigrams found in the data sets
## Function to return the top n most frequent terms of the provided n-gram term-document matrix
topNGram <- function(tdm, n){
  rollupMatrix <- as.matrix(rollup(tdm, MARGIN=2L, FUN=sum))
  ## MARGIN = 2L collapses the document columns, so each row (an n-gram) keeps its total count across all documents
  top_n_gram <- sort(rowSums(rollupMatrix), decreasing=TRUE)[1:n]
  top_n_gram
}
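As a side note, the same totals can be obtained directly with slam's row_sums, which avoids building the intermediate dense matrix (a hypothetical equivalent, not the code used below):
topNGram2 <- function(tdm, n) sort(row_sums(tdm), decreasing = TRUE)[1:n]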
topBlogUniGram <- topNGram(blogUniGramMatrix, 10)
topTwitUniGram <- topNGram(twitUniGramMatrix, 10)
topNewsUniGram <- topNGram(newsUniGramMatrix, 10)
par(mfrow=c(1,3), las = 3)
barplot(topBlogUniGram, names.arg=names(topBlogUniGram), main="Word Frequency Count from Blogs", ylab="Frequency", col="grey")
barplot(topTwitUniGram, names.arg=names(topTwitUniGram), main="Word Frequency Count from Twitter", ylab="Frequency", col="green")
barplot(topNewsUniGram, names.arg=names(topNewsUniGram), main="Word Frequency Count from News", ylab="Frequency", col="blue")

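Since ggplot2 is already loaded, the same counts could also be plotted with it; a small sketch for the blog unigrams (the df name is new here, and this is not the rendering shown above):
df <- data.frame(term = names(topBlogUniGram), freq = as.numeric(topBlogUniGram))
ggplot(df, aes(x = reorder(term, -freq), y = freq)) +
  geom_bar(stat = "identity", fill = "grey") +
  labs(title = "Word Frequency Count from Blogs", x = NULL, y = "Frequency") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))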
Exploring the most frequent bigrams found in the sample data sets
# Return top BinGram of each data set
topBlogBinGram <- topNGram(blogBinGramMatrix, 10)
topTwitBinGram <- topNGram(twitBinGramMatrix, 10)
topNewsBinGram <- topNGram(newsBinGramMatrix, 10)
par(mfrow=c(1,3), las = 3)
barplot(topBlogBinGram, names.arg=names(topBlogBinGram), main="Word Frequency Count from Blogs", ylab="Frequency", col="grey")
barplot(topTwitBinGram, names.arg=names(topTwitBinGram), main="Word Frequency Count from Twitter", ylab="Frequency", col="green")
barplot(topNewsBinGram, names.arg=names(topNewsBinGram), main="Word Frequency Count from News", ylab="Frequency", col="blue")

Exploring the most frequent trigrams found in the sample data sets
## Return the top trigrams of each data set
topBlogTriGram <- topNGram(blogTriGramMatrix, 10)
topTwitTriGram <- topNGram(twitTriGramMatrix, 10)
topNewsTriGram <- topNGram(newsTriGramMatrix, 10)
par(mfrow=c(1,3), las = 3)
barplot(topBlogTriGram, names.arg=names(topBlogTriGram), main="Word Frequency Count from Blogs", ylab="Frequency", col="grey")
barplot(topTwitTriGram, names.arg=names(topTwitTriGram), main="Word Frequency Count from Twitter", ylab="Frequency", col="blue")
barplot(topNewsTriGram, names.arg=names(topNewsTriGram), main="Word Frequency Count from News", ylab="Frequency", col="blue")

The Project Plan
In order to complete the project, the following steps will need to be carried out:
1. Build a predictive model based on the n-gram model presented in this report (a minimal sketch of the idea follows this list).
2. Evaluate the model for efficiency and accuracy.
3. Evaluate the model's accuracy using different metrics, such as perplexity and accuracy at the first, second, and third word.
4. Explore different and new models and data to improve the predictive model's power.
5. Create a data product to showcase the prediction algorithm: a Shiny app that accepts an n-gram and predicts the next word.
6. Create a slide deck of no more than 5 slides (using RStudio Presenter) to promote the product and explain how it works.
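A minimal sketch of the prediction idea from step 1, assuming a full trigram frequency vector built the same way as in topNGram() but without the top-n cut (allTriGram and predictNext are hypothetical names, not code from this report):
## full trigram frequency vector from the blog sample (hypothetical)
allTriGram <- sort(rowSums(as.matrix(rollup(blogTriGramMatrix, MARGIN = 2L, FUN = sum))), decreasing = TRUE)
## given a two-word prefix, return the most frequent completing word (naive lookup, no backoff)
predictNext <- function(prefix, counts = allTriGram) {
  pfx <- paste0(tolower(prefix), " ")
  hits <- counts[substr(names(counts), 1, nchar(pfx)) == pfx]
  if (length(hits) == 0) return(NA_character_)
  tail(strsplit(names(hits)[1], " ")[[1]], 1)
}
predictNext("one of")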