Summary

The goal of this project is to demonstrate my basic understanding of the data provided; it is the first step towards building a full N-gram predictive model. The report below shows how the data were loaded, basic summary statistics, the cleaning steps applied to the text, and plots of the most frequent N-grams.

Loading the data files

#packages used throughout this report (plyr is called below via plyr::)
library(tm)
library(ggplot2)

#path to the en_US data files (adjust to match your own setup)
location <- "F:/Home/coursera/projects/capstone/data/final/en_US/"

#read the US blogs file
blogs <- readLines(file.path(location, "en_US.blogs.txt"), skipNul = TRUE)

#read the US news file
news <- readLines(file.path(location, "en_US.news.txt"), skipNul = TRUE)

#read the US twitter file
twitter <- readLines(file.path(location, "en_US.twitter.txt"), skipNul = TRUE)
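
On Windows, readLines() can stop early on en_US.news.txt because the file contains an embedded control character, which makes the news line count look much lower than the other two files. If that happens, a common workaround is to read the file through a binary connection; a minimal sketch, using the same location path:

#alternative: read the news file through a binary connection to avoid early truncation
con <- file(file.path(location, "en_US.news.txt"), open = "rb")
news <- readLines(con, skipNul = TRUE)
close(con)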

Basic report of summary statistics

#data size: number of lines, object size, and character counts per source
data.size <- matrix(c(length(blogs), length(news), length(twitter),
                      format(object.size(blogs), units = "auto"),
                      format(object.size(news), units = "auto"),
                      format(object.size(twitter), units = "auto"),
                      sum(nchar(blogs)), sum(nchar(news)), sum(nchar(twitter))),
                    nrow = 3, ncol = 3, byrow = FALSE,
                    dimnames = list(c("blogs", "news", "twitter"),
                                    c("lines", "size in Mb", "characters")))
#print data size
data.size
##         lines     size in Mb characters 
## blogs   "899288"  "255.4 Mb" "208361438"
## news    "77259"   "19.8 Mb"  "15683765" 
## twitter "2360148" "319 Mb"   "162385035"
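
Word counts are another useful summary statistic. A minimal sketch using base R, assuming the three full vectors are still in memory (this can take a while on the complete files):

#approximate word counts per source, splitting on whitespace
word.count <- sapply(list(blogs = blogs, news = news, twitter = twitter),
                     function(x) sum(lengths(strsplit(x, "\\s+"))))
word.count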

Removing non-ASCII characters

#set a random seed so the sampling below is reproducible
set.seed(1234)

# drop non-ASCII characters from each line
blogs <- iconv(blogs, "latin1", "ASCII", sub = "")
news <- iconv(news, "latin1", "ASCII", sub = "")
twitter <- iconv(twitter, "latin1", "ASCII", sub = "")
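
Note that iconv() with sub = "" drops the offending characters rather than whole words; a small illustration:

#example: the accented byte is removed, the rest of the line is kept
iconv("caf\xe9 latte", "latin1", "ASCII", sub = "")
# -> "caf latte"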

Creating a training set from the data

#sample roughly 2% of the lines from each file as the training data
blogs <- blogs[rbinom(length(blogs), size = 1, prob = 0.98) == 0]
news <- news[rbinom(length(news), size = 1, prob = 0.98) == 0]
twitter <- twitter[rbinom(length(twitter), size = 1, prob = 0.98) == 0]

#combine all three samples into a single training text
trainingtext <- c(blogs, news, twitter)

# Load the combined text as a tm corpus
training <- VCorpus(VectorSource(trainingtext))
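
A quick sanity check that the corpus was built as expected (the exact content depends on the random sample):

#number of documents and the content of the first one
length(training)
as.character(training[[1]])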

Text transformation

# Replace a few special characters with a space
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
training <- tm_map(training, toSpace, "/")
training <- tm_map(training, toSpace, "@")
training <- tm_map(training, toSpace, "\\|")

# Convert the text to lower case
training <- tm_map(training, content_transformer(tolower))

# Remove numbers
training <- tm_map(training, removeNumbers)

# Remove common English stopwords
training <- tm_map(training, removeWords, stopwords("english"))

# Optionally remove custom stopwords, specified as a character vector
#training <- tm_map(training, removeWords, c("bla"))

# Remove punctuation
training <- tm_map(training, removePunctuation)

# Eliminate extra white spaces
training <- tm_map(training, stripWhitespace)

# Text stemming
training <- tm_map(training, stemDocument)

# Convert the stemmed documents back to plain text documents
training <- tm_map(training, PlainTextDocument)
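
Before building the N-grams it is worth spot-checking a few documents to confirm the transformations behaved as intended (the exact text depends on the random sample):

#spot-check a few cleaned documents
lapply(training[1:3], as.character)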

N-Gram creation

#build term-document matrices of 1-, 2- and 3-grams with an NLP-based tokenizer
ngram1 <- TermDocumentMatrix(training, control = list(tokenize = function(x)
  unlist(lapply(ngrams(words(x), 1), paste, collapse = " "), use.names = FALSE)))

ngram2 <- TermDocumentMatrix(training, control = list(tokenize = function(x)
  unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)))

ngram3 <- TermDocumentMatrix(training, control = list(tokenize = function(x)
  unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)))
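
The tokenizer above relies on NLP's ngrams(), which simply slides a window of length n over a token vector; a small self-contained illustration:

#example: bigrams from a short token vector
tokens <- strsplit("this is just a test", " ")[[1]]
unlist(lapply(ngrams(tokens, 2), paste, collapse = " "), use.names = FALSE)
# -> "this is" "is just" "just a" "a test"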
 
#keep terms that appear at least 50 times and compute their total frequencies
ngram1.freq <- rowSums(as.matrix(ngram1[findFreqTerms(ngram1, lowfreq = 50), ]))
ngram1.freq <- data.frame(term = names(ngram1.freq), freq = ngram1.freq)

ngram2.freq <- rowSums(as.matrix(ngram2[findFreqTerms(ngram2, lowfreq = 50), ]))
ngram2.freq <- data.frame(term = names(ngram2.freq), freq = ngram2.freq)

ngram3.freq <- rowSums(as.matrix(ngram3[findFreqTerms(ngram3, lowfreq = 50), ]))
ngram3.freq <- data.frame(term = names(ngram3.freq), freq = ngram3.freq)
#sort each frequency table in decreasing order of frequency
ngram1.ordered <- plyr::arrange(ngram1.freq, -freq)
ngram2.ordered <- plyr::arrange(ngram2.freq, -freq)
ngram3.ordered <- plyr::arrange(ngram3.freq, -freq)

#keep only the 20 most frequent terms for plotting
ngram1.ordered <- ngram1.ordered[1:20, ]
ngram2.ordered <- ngram2.ordered[1:20, ]
ngram3.ordered <- ngram3.ordered[1:20, ]
#order the bars by frequency so the most frequent term appears at the top
mgram1.plot <- ggplot(ngram1.ordered, aes(x = reorder(term, freq), y = freq)) +
  geom_bar(stat = "identity") + coord_flip() +
  labs(title = "Most Frequent: 1-gram", x = "term")

mgram1.plot

mgram2.plot <- ggplot(ngram2.ordered, aes(x = reorder(term, freq), y = freq)) +
  geom_bar(stat = "identity") + coord_flip() +
  labs(title = "Most Frequent: 2-gram", x = "term")

mgram2.plot

mgram3.plot <- ggplot(ngram3.ordered, aes(x = reorder(term, freq), y = freq)) +
  geom_bar(stat = "identity") + coord_flip() +
  labs(title = "Most Frequent: 3-gram", x = "term")

mgram3.plot
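
These frequency tables are the raw material for the predictive model mentioned in the summary. As a hypothetical sketch only (the predict.next helper below is not part of this report's pipeline), the trigram table could later be queried for the most frequent completion of a two-word prefix:

#hypothetical helper: suggest the most frequent completion of a two-word prefix
predict.next <- function(prefix, ngram.table) {
  hits <- ngram.table[grepl(paste0("^", prefix, " "), ngram.table$term), ]
  if (nrow(hits) == 0) return(NA_character_)
  best <- as.character(hits[which.max(hits$freq), "term"])
  #keep only the last word of the best-matching trigram
  tail(strsplit(best, " ")[[1]], 1)
}

#example call; remember the corpus is stemmed and stopword-free, so the prefix must be too
predict.next("happi new", ngram3.freq)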