Summary

The goal of this project is to demonstrate my basic understanding of the data provided; it is the first step towards building a full N-gram predictive model. The report below shows how the data were loaded, basic summary statistics, the cleaning steps applied to the text, and plots of the most frequent N-grams.

Loading the data files

#packages used throughout this report (plyr is called below via plyr::)
library(tm)
library(ggplot2)

#path to the en_US data files (adjust to match your own setup)
location <- "F:/Home/coursera/projects/capstone/data/final/en_US/"

#read the US blogs file
blogs <- readLines(file.path(location, "en_US.blogs.txt"), skipNul = TRUE)

#read the US news file
news <- readLines(file.path(location, "en_US.news.txt"), skipNul = TRUE)

#read the US twitter file
twitter <- readLines(file.path(location, "en_US.twitter.txt"), skipNul = TRUE)
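
On Windows, readLines() can stop early on en_US.news.txt because the file contains an embedded control character, which makes the news line count look much lower than the other two files. If that happens, a common workaround is to read the file through a binary connection; a minimal sketch, using the same location path:

#alternative: read the news file through a binary connection to avoid early truncation
con <- file(file.path(location, "en_US.news.txt"), open = "rb")
news <- readLines(con, skipNul = TRUE)
close(con)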

Basic report of summary statistics

#data size: number of lines, object size, and character counts per source
data.size <- matrix(c(length(blogs), length(news), length(twitter),
                      format(object.size(blogs), units = "auto"),
                      format(object.size(news), units = "auto"),
                      format(object.size(twitter), units = "auto"),
                      sum(nchar(blogs)), sum(nchar(news)), sum(nchar(twitter))),
                    nrow = 3, ncol = 3, byrow = FALSE,
                    dimnames = list(c("blogs", "news", "twitter"),
                                    c("lines", "size in Mb", "characters")))
#print data size
data.size
##         lines     size in Mb characters 
## blogs   "899288"  "255.4 Mb" "208361438"
## news    "77259"   "19.8 Mb"  "15683765" 
## twitter "2360148" "319 Mb"   "162385035"
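
Word counts are another useful summary statistic. A minimal sketch using base R, assuming the three full vectors are still in memory (this can take a while on the complete files):

#approximate word counts per source, splitting on whitespace
word.count <- sapply(list(blogs = blogs, news = news, twitter = twitter),
                     function(x) sum(lengths(strsplit(x, "\\s+"))))
word.count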

Removing non-ASCII characters

#set a random seed so the sampling below is reproducible
set.seed(1234)

# drop non-ASCII characters from each line
blogs <- iconv(blogs, "latin1", "ASCII", sub = "")
news <- iconv(news, "latin1", "ASCII", sub = "")
twitter <- iconv(twitter, "latin1", "ASCII", sub = "")
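
Note that iconv() with sub = "" drops the offending characters rather than whole words; a small illustration:

#example: the accented byte is removed, the rest of the line is kept
iconv("caf\xe9 latte", "latin1", "ASCII", sub = "")
# -> "caf latte"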

Creating a training set from the data

#sample roughly 2% of the lines from each file as the training data
blogs <- blogs[rbinom(length(blogs), size = 1, prob = 0.98) == 0]
news <- news[rbinom(length(news), size = 1, prob = 0.98) == 0]
twitter <- twitter[rbinom(length(twitter), size = 1, prob = 0.98) == 0]

#combine all three samples into a single training text
trainingtext <- c(blogs, news, twitter)

# Load the combined text as a tm corpus
training <- VCorpus(VectorSource(trainingtext))
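
A quick sanity check that the corpus was built as expected (the exact content depends on the random sample):

#number of documents and the content of the first one
length(training)
as.character(training[[1]])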

Text transformation

# Replace a few special characters with a space
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
training <- tm_map(training, toSpace, "/")
training <- tm_map(training, toSpace, "@")
training <- tm_map(training, toSpace, "\\|")

# Convert the text to lower case
training <- tm_map(training, content_transformer(tolower))

# Remove numbers
training <- tm_map(training, removeNumbers)

# Remove common English stopwords
training <- tm_map(training, removeWords, stopwords("english"))

# Optionally remove custom stopwords, specified as a character vector
#training <- tm_map(training, removeWords, c("bla"))

# Remove punctuation
training <- tm_map(training, removePunctuation)

# Eliminate extra white spaces
training <- tm_map(training, stripWhitespace)

# Text stemming
training <- tm_map(training, stemDocument)

# Convert the stemmed documents back to plain text documents
training <- tm_map(training, PlainTextDocument)
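
Before building the N-grams it is worth spot-checking a few documents to confirm the transformations behaved as intended (the exact text depends on the random sample):

#spot-check a few cleaned documents
lapply(training[1:3], as.character)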

N-Gram creation

#build term-document matrices of 1-, 2- and 3-grams with an NLP-based tokenizer
ngram1 <- TermDocumentMatrix(training, control = list(tokenize = function(x)
  unlist(lapply(ngrams(words(x), 1), paste, collapse = " "), use.names = FALSE)))

ngram2 <- TermDocumentMatrix(training, control = list(tokenize = function(x)
  unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)))

ngram3 <- TermDocumentMatrix(training, control = list(tokenize = function(x)
  unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)))
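
The tokenizer above relies on NLP's ngrams(), which simply slides a window of length n over a token vector; a small self-contained illustration:

#example: bigrams from a short token vector
tokens <- strsplit("this is just a test", " ")[[1]]
unlist(lapply(ngrams(tokens, 2), paste, collapse = " "), use.names = FALSE)
# -> "this is" "is just" "just a" "a test"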
 
#keep terms that appear at least 50 times and compute their total frequencies
ngram1.freq <- rowSums(as.matrix(ngram1[findFreqTerms(ngram1, lowfreq = 50), ]))
ngram1.freq <- data.frame(term = names(ngram1.freq), freq = ngram1.freq)

ngram2.freq <- rowSums(as.matrix(ngram2[findFreqTerms(ngram2, lowfreq = 50), ]))
ngram2.freq <- data.frame(term = names(ngram2.freq), freq = ngram2.freq)

ngram3.freq <- rowSums(as.matrix(ngram3[findFreqTerms(ngram3, lowfreq = 50), ]))
ngram3.freq <- data.frame(term = names(ngram3.freq), freq = ngram3.freq)
#sort each frequency table in decreasing order of frequency
ngram1.ordered <- plyr::arrange(ngram1.freq, -freq)
ngram2.ordered <- plyr::arrange(ngram2.freq, -freq)
ngram3.ordered <- plyr::arrange(ngram3.freq, -freq)

#keep only the 20 most frequent terms for plotting
ngram1.ordered <- ngram1.ordered[1:20, ]
ngram2.ordered <- ngram2.ordered[1:20, ]
ngram3.ordered <- ngram3.ordered[1:20, ]
#order the bars by frequency so the most frequent term appears at the top
mgram1.plot <- ggplot(ngram1.ordered, aes(x = reorder(term, freq), y = freq)) +
  geom_bar(stat = "identity") + coord_flip() +
  labs(title = "Most Frequent: 1-gram", x = "term")

mgram1.plot

mgram2.plot <- ggplot(ngram2.ordered, aes(x = reorder(term, freq), y = freq)) +
  geom_bar(stat = "identity") + coord_flip() +
  labs(title = "Most Frequent: 2-gram", x = "term")

mgram2.plot

mgram3.plot <- ggplot(ngram3.ordered, aes(x = reorder(term, freq), y = freq)) +
  geom_bar(stat = "identity") + coord_flip() +
  labs(title = "Most Frequent: 3-gram", x = "term")

mgram3.plot
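
These frequency tables are the raw material for the predictive model mentioned in the summary. As a hypothetical sketch only (the predict.next helper below is not part of this report's pipeline), the trigram table could later be queried for the most frequent completion of a two-word prefix:

#hypothetical helper: suggest the most frequent completion of a two-word prefix
predict.next <- function(prefix, ngram.table) {
  hits <- ngram.table[grepl(paste0("^", prefix, " "), ngram.table$term), ]
  if (nrow(hits) == 0) return(NA_character_)
  best <- as.character(hits[which.max(hits$freq), "term"])
  #keep only the last word of the best-matching trigram
  tail(strsplit(best, " ")[[1]], 1)
}

#example call; remember the corpus is stemmed and stopword-free, so the prefix must be too
predict.next("happi new", ngram3.freq)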