Introduction

This report presents the exploratory analysis of the three en_US text files provided for the capstone project, from which a text-prediction algorithm and data product will eventually be developed. The three files are stored on my personal machine and were downloaded from the Coursera Capstone Project Data (https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip):

• en_US.twitter.txt
• en_US.blogs.txt
• en_US.news.txt
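For completeness, here is a minimal sketch of how the archive could be downloaded and extracted; the destination directory and file name are illustrative, and the analysis below simply reads the files already extracted to my local drive.

url = "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zipfile = "Coursera-SwiftKey.zip"
if (!file.exists(zipfile)) download.file(url, zipfile, mode = "wb")
unzip(zipfile, exdir = ".")  # extracts the en_US (and other locale) text files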

Importing Data and Basic Information

Loading the required data after extracting the archive to a local drive

library(tm)
## Loading required package: NLP
blogs_file="C:/ALib/training/coursera/datascience2/en_US.blogs.txt"
news_file="C:/ALib/training/coursera/datascience2/en_US.news.txt"
twitter_file="C:/ALib/training/coursera/datascience2/en_US.twitter.txt"
usblogs = readLines(blogs_file)
usnews = readLines(news_file)
## Warning in readLines(news_file): incomplete final line found on 'C:/ALib/
## training/coursera/datascience2/en_US.news.txt'
ustwitter = readLines(twitter_file)
## Warning in readLines(twitter_file): line 167155 appears to contain an embedded
## nul
## Warning in readLines(twitter_file): line 268547 appears to contain an embedded
## nul
## Warning in readLines(twitter_file): line 1274086 appears to contain an embedded
## nul
## Warning in readLines(twitter_file): line 1759032 appears to contain an embedded
## nul
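# Note: the embedded-nul warnings above could be avoided by reading the twitter
# file with readLines(twitter_file, skipNul = TRUE); the counts reported below
# use the lines exactly as read above.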
files = c(blogs_file,news_file,twitter_file)
mbsizes = sapply(files, function(x) {file.size(x)/1024^2})
library(stringr)
stats = sapply(list(usblogs,usnews,ustwitter),function(x){ c(length(x) , sum(str_count(x,'\\S+')) )})
invisible(gc())
stats = rbind(mbsizes,stats)
stats = as.data.frame(stats)
names(stats) = c('usblogs','usnews','ustwitter')
row.names(stats) = c('filesize(MB)','lines','word_count')
library(knitr)
kable(stats,digits = 0)
|              |  usblogs |  usnews | ustwitter |
|:-------------|---------:|--------:|----------:|
| filesize(MB) |      200 |     196 |       159 |
| lines        |   899288 |   77259 |   2360148 |
| word_count   | 37334441 | 2643972 |  30373792 |
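The word counts above treat any run of non-whitespace characters as a word; a quick toy illustration of the counting rule (stringr is already loaded above):

str_count("To be, or not to be", '\\S+')  # 6 tokens, split on whitespace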
Build and Clean Corpus

We’ll build and clean the corpus. Because the raw texts are too large to process in full, we’ll sample only 0.1% of each file (blogs, news, and twitter) by keeping every 1000th line.

usblogs = usblogs[seq(1,length(usblogs),1000)]
usnews = usnews[seq(1,length(usnews),1000)]
ustwitter = ustwitter[seq(1,length(ustwitter),1000)]
invisible(gc())
raw_text = c(usblogs,usnews,ustwitter)
invisible(gc())
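Taking every 1000th line is a systematic rather than random sample. If a random 0.1% sample were preferred, an rbinom-based selection could be swapped in; sample_lines below is an illustrative helper, not part of the analysis above.

set.seed(1234)
sample_lines = function(x, rate = 0.001) x[as.logical(rbinom(length(x), 1, rate))]
# e.g. usblogs = sample_lines(usblogs); usnews = sample_lines(usnews); ustwitter = sample_lines(ustwitter)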

Next, we’ll build the corpus:

make a volatile corpus

raw_source = VectorSource(raw_text)
invisible(gc())
raw_corpus <- VCorpus(raw_source)
invisible(gc())

Next, we’ll clean the corpus: convert words to lower case; strip extra white space; and remove punctuation, numbers, stray ‘th’ suffixes (as in 4th), URLs, non-ASCII characters, and letters repeated more than twice within a word.

clean the corpus

clean_corpus <- function(corpus){
  corpus <- tm_map(corpus, content_transformer(tolower)) # convert to lower case
  corpus <- tm_map(corpus, stripWhitespace) # remove white space
  corpus <- tm_map(corpus, removePunctuation) # remove punctuation
  corpus <- tm_map(corpus,content_transformer(function(x) gsub("[[:digit:]]","",x)))# remove numbers
  corpus <- tm_map(corpus,content_transformer(function(x) gsub(" th", "",x))) # remove th (like 4th)
  corpus <- tm_map(corpus,content_transformer(function(x) gsub("http[[:alnum:]]*","",x))) # remove url
  corpus <- tm_map(corpus,content_transformer(function(x) iconv(x, "latin1", "ASCII", sub=""))) # remove non-ASCII characters
  corpus <- tm_map(corpus,content_transformer(function(x) gsub("([[:alpha:]])\\1{2,}", "\\1\\1", x))) # remove repeated alphabets in a word
  gc()
  return(corpus)
}
corpus <- clean_corpus(raw_corpus)
save(corpus,file='corpus.RData')
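As a quick sanity check (output not shown; it varies with the sample), the cleaned corpus can be inspected before building the term-document matrices:

length(corpus)        # number of sampled documents retained
content(corpus[[1]])  # cleaned text of the first document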

Build N-Gram Models

Here, we’ll build N-gram models consisting of uni-grams, bi-grams, and tri-grams. Additionally, the top 10 terms by probability and the word-coverage curve are plotted for each.
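For intuition, RWeka’s NGramTokenizer slides a window of the requested size over the words of a string; a toy illustration (the sentence is made up):

library(RWeka)
NGramTokenizer("thanks for the follow", Weka_control(min = 2, max = 2))
# should return the bi-grams: "thanks for" "for the" "the follow"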

load('corpus.RData')

unigram

library(rJava) 
library(RWeka)
unigram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
tdm1<-TermDocumentMatrix(corpus,control = list(tokenize = unigram))
invisible(gc())
wordMatrix1 = as.data.frame(as.matrix(tdm1))
invisible(gc())
v1 <- sort(rowSums(wordMatrix1),decreasing=TRUE)
d1 <- data.frame(word = names(v1),freq=v1)
rm(tdm1,wordMatrix1)
for(i in 1:10) gc()

word probability

d1$prob = d1$freq/sum(d1$freq)
barplot(d1[1:10,c('prob')],names.arg = d1[1:10,'word'],main = 'Probability of uni-gram words')

word coverage

d1$cum_prob = cumsum(d1$prob)
plot(d1$cum_prob,type='l',ylab = 'cumulative probability',main = 'coverage of uni-gram words')
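The coverage curve can also be summarised numerically, e.g. how many distinct words are needed to cover 50% and 90% of all word instances in the sample:

min(which(d1$cum_prob >= 0.5))  # distinct words covering 50% of word instances
min(which(d1$cum_prob >= 0.9))  # distinct words covering 90% of word instances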

save(d1,file='d1.RData')

bigram

bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdm2<-TermDocumentMatrix(corpus,control = list(tokenize = bigram))
invisible(gc())
wordMatrix2 = as.matrix(  tdm2 )
wordMatrix2 = as.data.frame(wordMatrix2 ) 
invisible(gc())
v2 <- sort(rowSums(wordMatrix2),decreasing=TRUE)
d2 <- data.frame(word = names(v2),freq=v2)
rm(tdm2,wordMatrix2)
for(i in 1:10) gc()

word probability

d2$prob = d2$freq/sum(d2$freq)
barplot(d2[1:10,c('prob')],names.arg = d2[1:10,'word'],main = 'Probability of bi-gram words')

word coverage

d2$cum_prob = cumsum(d2$prob)
plot(d2$cum_prob,type='l',ylab = 'cumulative probability',main = 'coverage of bi-gram words')

save(d2,file='d2.RData')

trigram

trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tdm3<-TermDocumentMatrix(corpus,control = list(tokenize = trigram))
invisible(gc())
wordMatrix3 = as.matrix(  tdm3 )
invisible(gc())