Natural Language Processing

Summary

The purpose of this document is to describe the process of preparing data for building a predictive text model. The provided training data was obtained from three types of sources: blogs, news, and Twitter. The process described in this document includes:

  • Reading the data into R
  • Data cleaning
  • Exploratory data analysis

1. Loading Required Libraries and Data

#load libraries
library(ggplot2)
library(stringi)
library(knitr)
library(tm)
library(dplyr)
library(RWeka)
#load data
# Directory that holds the three en_US corpus files. Change this one line to
# point the report at a different copy of the data.
data_dir <- "C:/Users/AdhamSmallPC/Documents/final/en_US"

# Read every line of one corpus file into a character vector.
#
# The file is opened through an explicit binary-mode connection ("rb").
# readLines() on a bare path opens the file in text mode, and on Windows a
# text-mode read stops at an embedded Ctrl-Z (0x1A) byte, silently truncating
# the result. The news line count reported in this document is far smaller
# than the other two, which is consistent with such an early stop.
# readLines() accepts LF, CRLF and CR line endings in either mode, so binary
# mode does not change how lines are split.
#
# path: full path of the text file to read.
# Returns a character vector with one element per line, marked as UTF-8, with
# embedded NUL bytes skipped.
read_corpus <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blog <- read_corpus(file.path(data_dir, "en_US.blogs.txt"))

news <- read_corpus(file.path(data_dir, "en_US.news.txt"))

twit <- read_corpus(file.path(data_dir, "en_US.twitter.txt"))

2. Metadata of Training set

# Corpus labels, in the same order as the values gathered below.
x <- c("blog", "news", "twitter")
corpora <- list(blog, news, twit)

# In-memory size of each character vector in bytes. This is object.size(),
# i.e. the footprint of the R object, not the size of the file on disk.
mem_bytes <- vapply(
  corpora,
  function(lines) as.numeric(object.size(lines)),
  numeric(1)
)

# Number of lines: one vector element per line read.
line_counts <- vapply(corpora, length, integer(1))

# Total number of words, summing the per-line word counts.
word_counts <- vapply(
  corpora,
  function(lines) sum(stri_count_words(lines)),
  integer(1)
)

table <- data.frame(
  type = x,
  Filesize = mem_bytes,
  NumberofLines = line_counts,
  Numberofwords = word_counts
)

kable(table)
type Filesize NumberofLines Numberofwords
blog 267758632 899288 38154238
news 20729472 77259 2693898
twitter 334484736 2360148 30218125
***

3. Cleaning the Data

set.seed(12345)
#sample each data type
# Fraction of lines drawn from every source. The three sample() calls stay in
# this order so the seed above reproduces the same draw.
sample_frac <- 0.001
blog_sample <- sample(blog, length(blog) * sample_frac)
news_sample <- sample(news, length(news) * sample_frac)
twit_sample <- sample(twit, length(twit) * sample_frac)

#add all samples in one dataset
all_sample <- c(blog_sample, news_sample, twit_sample)

# Transformer that replaces every match of `pattern` with a single space.
f <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

corp <- VCorpus(VectorSource(all_sample))

# Lower-case first so the lower-case URL pattern also catches "HTTP://...".
corp <- tm_map(corp, content_transformer(tolower))

# URLs and @mentions have to be removed BEFORE punctuation: once "://" and "@"
# have been stripped the patterns below can no longer match anything.
# [^[:space:]] is used instead of [^\\s] because R's default regex engine does
# not treat \s as a class inside brackets, and the URL match stops at the
# first whitespace instead of greedily running to the last ".xyz" on the line.
corp <- tm_map(corp, f, "(f|ht)tps?://[^[:space:]]+")
corp <- tm_map(corp, f, "@[^[:space:]]+")

corp <- tm_map(corp, removeNumbers)
corp <- tm_map(corp, removePunctuation)

# Collapse whitespace last: every removal above can leave runs of blanks.
corp <- tm_map(corp, stripWhitespace)

format(object.size(corp), units="Mb")
## [1] "13.7 Mb"

Create the term-document matrix

# Term-document matrix of the cleaned corpus, built with tm's default settings.
dm <- TermDocumentMatrix(corp)
print(dm)
## <<TermDocumentMatrix (terms: 124827, documents: 166833)>>
## Non-/sparse entries: 2354995/20822907896
## Sparsity           : 100%
## Maximal term length: 351
## Weighting          : term frequency (tf)
# Terms that occur at least 10000 times in total (no upper bound).
findFreqTerms(dm, lowfreq = 10000, highfreq = Inf)
##  [1] "about" "all"   "and"   "are"   "but"   "for"   "from"  "have" 
##  [9] "just"  "like"  "not"   "one"   "out"   "that"  "the"   "they" 
## [17] "this"  "was"   "what"  "will"  "with"  "you"   "your"

4. N-gram analysis

# Build a tokenizer that splits text into n-grams of exactly `n` words.
make_ngram_tokenizer <- function(n) {
  force(n)
  function(z) NGramTokenizer(z, Weka_control(min = n, max = n))
}

unigram <- make_ngram_tokenizer(1)
bigram <- make_ngram_tokenizer(2)
trigram <- make_ngram_tokenizer(3)

# Document-term matrix of `corpus` using `tokenizer`.
# wordLengths is set explicitly because tm's default, c(3, Inf), silently drops
# every term shorter than three characters ("a", "i", "to", "of", ...), which
# removes some of the most frequent words from the counts.
build_ngram_dtm <- function(corpus, tokenizer) {
  DocumentTermMatrix(
    corpus,
    control = list(tokenize = tokenizer, wordLengths = c(1, Inf))
  )
}

# Total count of every term in `dtm`, as a named vector sorted from most to
# least frequent.
# The matrix is stored as sparse triplets (row i, column j, value v), so the
# column totals are obtained by summing v per column index. This avoids
# as.matrix(), which would allocate a dense documents-by-terms matrix.
term_totals <- function(dtm) {
  sums <- rowsum(dtm$v, dtm$j)
  totals <- as.vector(sums)
  names(totals) <- Terms(dtm)[as.integer(rownames(sums))]
  sort(totals, decreasing = TRUE)
}

uniGram <- build_ngram_dtm(corp, unigram)
uniFreq <- term_totals(uniGram)
uni_Freq <- data.frame(term = names(uniFreq), occur = uniFreq)

biGram <- build_ngram_dtm(corp, bigram)
biFreq <- term_totals(biGram)
bi_Freq <- data.frame(term = names(biFreq), occur = biFreq)

triGram <- build_ngram_dtm(corp, trigram)
triFreq <- term_totals(triGram)
tri_Freq <- data.frame(term = names(triFreq), occur = triFreq)

5. Visualize word frequency

# Bar chart of the terms in `freq` that occur more than `min_occur` times.
#
# freq:      data frame with a `term` column and an `occur` (count) column.
# min_occur: only terms with occur strictly greater than this are drawn.
# label:     n-gram name used as the title prefix, e.g. "Uni-gram".
# outline:   colour of the bar outlines.
# Returns the ggplot object; at top level it is drawn by auto-printing.
plot_ngram_freq <- function(freq, min_occur, label, outline) {
  frequent <- freq[freq$occur > min_occur, ]
  ggplot(frequent, aes(x = factor(term), y = occur)) +
    ggtitle(paste0(label, ":  Terms with Frequency > ", min_occur)) +
    xlab("Terms") +
    ylab("Frequency") +
    geom_bar(stat = "identity", color = outline) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}

#unigram
plot_ngram_freq(uni_Freq, 200, "Uni-gram", "blue")

#bigram
plot_ngram_freq(bi_Freq, 50, "Bi-gram", "red")

#Trigram
# Title fixed: this plot was previously labelled "Bi-gram".
plot_ngram_freq(tri_Freq, 10, "Tri-gram", "green")

6. Conclusion

The data has been downloaded, cleaned, and explored. It will be used to build a predictive text Shiny app once the R code required for text prediction is complete.