Milestone Report

Natural Language Processing

Summary

The purpose of this document is to describe the process of preparing data for building a predictive text model. The provided training data was obtained from three types of sources: blogs, news and Twitter. The process described in this document includes:

Data reading by R
Data cleaning
Exploratory data analysis

1. Loading required libraries and data

#load libraries
library(ggplot2)
library(stringi)
library(knitr)
library(tm)
library(dplyr)
library(RWeka)

# Load the three en_US training files.
# Directory that holds the corpus files.
data_dir <- "C:/Users/AdhamSmallPC/Documents/final/en_US"

# Read all lines of one corpus file.
#
# The connection is opened in binary mode ("rb") instead of letting
# readLines() open the path in text mode: in text mode on Windows an embedded
# Ctrl-Z (0x1A) byte is treated as end-of-file and the read stops early
# without an error. The connection is always closed via on.exit().
# NOTE(review): encoding = "UTF-8" assumes the files are UTF-8 - confirm.
read_corpus <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blog <- read_corpus(file.path(data_dir, "en_US.blogs.txt"))

news <- read_corpus(file.path(data_dir, "en_US.news.txt"))

twit <- read_corpus(file.path(data_dir, "en_US.twitter.txt"))

2. Metadata of Training set

# Row labels for the summary table, one per source.
x <- c("blog", "news", "twitter")

# Size of each loaded character vector as reported by object.size()
# (bytes used by the R object in memory, not the size of the file on disk).
sz.blog <- object.size(blog)
sz.news <- object.size(news)
sz.twit <- object.size(twit)

# Number of lines read from each source.
ln.blog <- length(blog)
ln.news <- length(news)
ln.twit <- length(twit)

# Total number of words per source: per-line word counts, summed.
W.blog <- sum(stri_count_words(blog))
W.news <- sum(stri_count_words(news))
W.twit <- sum(stri_count_words(twit))

# Collect the per-source figures into one data frame and render it.
table <- data.frame(
  type = x,
  Filesize = c(sz.blog, sz.news, sz.twit),
  NumberofLines = c(ln.blog, ln.news, ln.twit),
  Numberofwords = c(W.blog, W.news, W.twit)
)

kable(table)

type	Filesize	NumberofLines	Numberofwords
blog	267758632	899288	38154238
news	20729472	77259	2693898
twitter	334484736	2360148	30218125
***

3. Cleaning the data

# Fix the RNG seed so the random subsets below are reproducible.
set.seed(12345)

# Draw a 0.1% random subset of the lines of each source. The draws are made
# in the order blog, news, twitter, so the seeded sequence is consumed in a
# fixed order.
blog_sample <- sample(blog, size = length(blog) * 0.001)
news_sample <- sample(news, size = length(news) * 0.001)
twit_sample <- sample(twit, size = length(twit) * 0.001)

# Pool the three subsets into a single character vector.
all_sample <- c(blog_sample, news_sample, twit_sample)

# Content transformer that replaces every match of `pattern` with one space.
f <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

# Build the corpus and lower-case it first so the lower-case URL scheme
# pattern below also matches upper-case input.
corp <- VCorpus(VectorSource(all_sample)) %>%
  tm_map(content_transformer(tolower))

# URLs and @mentions must be stripped BEFORE punctuation is removed:
# once "://", "." and "@" are gone these patterns can no longer match.
# [^[:space:]]+ limits each match to the URL/mention token itself; a greedy
# (.*) could swallow the text that follows a URL, and \\s inside a bracket
# expression is not a whitespace class in R's default regex engine.
corp <- tm_map(corp, f, "(f|ht)tps?://[^[:space:]]+")
corp <- tm_map(corp, f, "@[^[:space:]]+")

# Whitespace is collapsed last because the substitutions above and the
# number/punctuation removal can leave runs of spaces behind.
corp <- corp %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace)

# Report the in-memory size of the cleaned corpus.
format(object.size(corp), units = "Mb")

## [1] "13.7 Mb"

Create the term-document matrix

# Term-document matrix over the cleaned corpus, using tm's default control
# settings (no control list is supplied).
dm <- TermDocumentMatrix(x = corp)
# Auto-print the matrix summary.
dm

## <<TermDocumentMatrix (terms: 124827, documents: 166833)>>
## Non-/sparse entries: 2354995/20822907896
## Sparsity           : 100%
## Maximal term length: 351
## Weighting          : term frequency (tf)

# List the terms whose total frequency in `dm` is at least 10000.
findFreqTerms(x = dm, lowfreq = 10000)

##  [1] "about" "all"   "and"   "are"   "but"   "for"   "from"  "have" 
##  [9] "just"  "like"  "not"   "one"   "out"   "that"  "the"   "they" 
## [17] "this"  "was"   "what"  "will"  "with"  "you"   "your"

4. N-gram analysis

# Tokenizer wrappers around RWeka's NGramTokenizer. Each passes its input `z`
# through with min and max set to the same value, i.e. a fixed n-gram size.

# n-gram size 1
unigram <- function(z) {
  NGramTokenizer(z, control = Weka_control(min = 1, max = 1))
}

# n-gram size 2
bigram <- function(z) {
  NGramTokenizer(z, control = Weka_control(min = 2, max = 2))
}

# n-gram size 3
trigram <- function(z) {
  NGramTokenizer(z, control = Weka_control(min = 3, max = 3))
}
                                      
# Control list shared by the three document-term matrices.
# tm discards terms shorter than three characters by default
# (wordLengths = c(3, Inf)), which would silently drop words such as "a",
# "i", "to" and "of" from the unigram counts; c(1, Inf) keeps every term.
ngram_control <- function(tokenizer) {
  list(tokenize = tokenizer, wordLengths = c(1, Inf))
}

# Total occurrences of each term (column sums of the document-term matrix),
# sorted from most to least frequent.
# NOTE(review): as.matrix() builds a dense matrix; this is only practical
# for a small sample of the corpus.
term_totals <- function(dtm) {
  sort(colSums(as.matrix(dtm)), decreasing = TRUE)
}

# Unigrams
uniGram <- DocumentTermMatrix(corp, control = ngram_control(unigram))
uniFreq <- term_totals(uniGram)
uni_Freq <- data.frame(term = names(uniFreq), occur = uniFreq)

# Bigrams
biGram <- DocumentTermMatrix(corp, control = ngram_control(bigram))
biFreq <- term_totals(biGram)
bi_Freq <- data.frame(term = names(biFreq), occur = biFreq)

# Trigrams
triGram <- DocumentTermMatrix(corp, control = ngram_control(trigram))
triFreq <- term_totals(triGram)
tri_Freq <- data.frame(term = names(triFreq), occur = triFreq)

5. Visualize word frequency

# Uni-gram bar chart: terms that occur more than 200 times, with the x-axis
# labels rotated 45 degrees.
ggplot(
  uni_Freq[uni_Freq$occur > 200, ],
  aes(x = factor(term), y = occur)
) +
  ggtitle("Uni-gram:  Terms with Frequency > 200") +
  xlab("Terms") +
  ylab("Frequency") +
  geom_bar(stat = "identity", color = "blue") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Bi-gram bar chart: terms that occur more than 50 times, with the x-axis
# labels rotated 45 degrees.
ggplot(
  bi_Freq[bi_Freq$occur > 50, ],
  aes(x = factor(term), y = occur)
) +
  ggtitle("Bi-gram:  Terms with Frequency > 50") +
  xlab("Terms") +
  ylab("Frequency") +
  geom_bar(stat = "identity", color = "red") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Tri-gram bar chart: terms that occur more than 10 times, with the x-axis
# labels rotated 45 degrees.
# The title previously read "Bi-gram" (copied from the bigram plot).
ggplot(
  tri_Freq[tri_Freq$occur > 10, ],
  aes(x = factor(term), y = occur)
) +
  ggtitle("Tri-gram:  Terms with Frequency > 10") +
  xlab("Terms") +
  ylab("Frequency") +
  geom_bar(stat = "identity", color = "green") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Milestone Report

AAleid

January 7, 2019

Natural Language Processing

Summary

The purpose of this document is to describe the process of preparing data for building a predictive text model. The provided training data was obtained from three types of sources: blogs, news and Twitter. The process described in this document includes:

1. Loading required libraries and data

2. Metadata of Training set

3. Cleaning the data

Create the term-document matrix

4. N-gram analysis

5. Visualize word frequency

6. Conclusion

The data that has been downloaded, cleaned and explored will be used to build a predictive text Shiny app once the R code required for text prediction is completed.