Natural Language Processing
Summary
The purpose of this document is to describe the process of preparing data to build a predictive text model. The provided training data was obtained from three types of sources: blogs, news and Twitter. The process described in this document includes:
- Reading the data into R
- Cleaning the data
- Exploratory data analysis
1. Loading Required Libraries and Data
#load libraries
library(ggplot2)
library(stringi)
library(knitr)
library(tm)
library(dplyr)
library(RWeka)
# Load data ----
# Directory that holds the three en_US training files.
data_dir <- "C:/Users/AdhamSmallPC/Documents/final/en_US"

# Read every line of one training file from `data_dir`.
#
# The file is opened through a binary-mode connection: in text mode on
# Windows, an embedded control byte (e.g. Ctrl-Z, 0x1A) can be treated as
# end-of-file, which silently truncates the result. `encoding = "UTF-8"`
# marks the strings as UTF-8 so non-ASCII characters survive the later
# text transformations; `skipNul = TRUE` drops embedded NUL bytes.
#
# file_name: name of the file inside `data_dir`.
# Returns a character vector with one element per line.
read_corpus_file <- function(file_name) {
  con <- file(file.path(data_dir, file_name), open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blog <- read_corpus_file("en_US.blogs.txt")
news <- read_corpus_file("en_US.news.txt")
twit <- read_corpus_file("en_US.twitter.txt")
3. Sampling and Cleaning the Data
# Fix the random seed so the draws below are reproducible.
set.seed(12345)

# Draw 0.1% of the lines from each source. The three draws share a single
# RNG stream, so they are kept in the order blogs, news, Twitter.
blog_sample <- sample(x = blog, size = 0.001 * length(blog))
news_sample <- sample(x = news, size = 0.001 * length(news))
twit_sample <- sample(x = twit, size = 0.001 * length(twit))

# Pool the three samples into one character vector.
all_sample <- c(
  blog_sample,
  news_sample,
  twit_sample
)
# Content transformer that replaces every match of `pattern` with one space.
f <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

# Build the corpus from the pooled sample and normalise its text.
#
# Step order matters:
#   1. Lower-case first so the URL pattern (which is lower-case) matches.
#   2. Strip URLs and @mentions BEFORE removing punctuation; once "://",
#      "." and "@" are gone these patterns can no longer match anything.
#   3. Remove digits and the remaining punctuation.
#   4. Collapse whitespace last, because the substitutions above insert
#      spaces of their own.
#
# Both patterns stop at the first whitespace character (POSIX class
# [:space:]) so that only the URL / mention token itself is replaced, not
# the rest of the line.
corp <- VCorpus(VectorSource(all_sample)) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(f, "(f|ht)tps?://[^[:space:]]+") %>%
  tm_map(f, "@[^[:space:]]+") %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace)

# Report the in-memory size of the cleaned corpus.
format(object.size(corp), units = "Mb")
## [1] "13.7 Mb"
Create the term-document matrix
# Build the term-document matrix from the cleaned corpus.
dm <- corp %>% TermDocumentMatrix()
# Print the matrix summary.
dm
## <<TermDocumentMatrix (terms: 124827, documents: 166833)>>
## Non-/sparse entries: 2354995/20822907896
## Sparsity : 100%
## Maximal term length: 351
## Weighting : term frequency (tf)
# List the terms that occur at least 10000 times across the corpus.
findFreqTerms(x = dm, lowfreq = 10000)
## [1] "about" "all" "and" "are" "but" "for" "from" "have"
## [9] "just" "like" "not" "one" "out" "that" "the" "they"
## [17] "this" "was" "what" "will" "with" "you" "your"
4. N-gram Analysis
# Tokenizers that split a document into n-grams of exactly n words.
unigram <- function(z) NGramTokenizer(z, Weka_control(min = 1, max = 1))
bigram <- function(z) NGramTokenizer(z, Weka_control(min = 2, max = 2))
trigram <- function(z) NGramTokenizer(z, Weka_control(min = 3, max = 3))

# Total occurrences of every term in a document-term matrix, as a named
# numeric vector sorted from most to least frequent.
#
# The columns are summed on the sparse representation. Converting with
# as.matrix() first would allocate a dense documents x terms matrix, which
# does not fit in memory for corpora of realistic size.
ngram_freq <- function(dtm) {
  sort(slam::col_sums(dtm), decreasing = TRUE)
}

# Unigrams. tm discards tokens shorter than 3 characters by default, which
# would drop words such as "a", "i", "to" and "of" from the unigram counts
# while the bi-/tri-grams below still contain them; keep every length.
uniGram <- DocumentTermMatrix(
  corp,
  control = list(tokenize = unigram, wordLengths = c(1, Inf))
)
uniFreq <- ngram_freq(uniGram)
uni_Freq <- data.frame(term = names(uniFreq), occur = uniFreq)

# Bigrams.
biGram <- DocumentTermMatrix(corp, control = list(tokenize = bigram))
biFreq <- ngram_freq(biGram)
bi_Freq <- data.frame(term = names(biFreq), occur = biFreq)

# Trigrams.
triGram <- DocumentTermMatrix(corp, control = list(tokenize = trigram))
triFreq <- ngram_freq(triGram)
tri_Freq <- data.frame(term = names(triFreq), occur = triFreq)
5. Visualize word frequency
# Bar chart of the terms in `freq` that occur more than `min_occur` times.
#
# freq:      data frame with columns `term` and `occur`.
# min_occur: only terms with occur > min_occur are drawn.
# label:     n-gram name used as the title prefix, e.g. "Uni-gram".
# outline:   outline colour of the bars.
# Returns the ggplot object (auto-printed when called at top level).
plot_ngram_freq <- function(freq, min_occur, label, outline) {
  ggplot(freq[freq$occur > min_occur, ], aes(x = factor(term), y = occur)) +
    ggtitle(paste0(label, ": Terms with Frequency > ", min_occur)) +
    xlab("Terms") +
    ylab("Frequency") +
    geom_bar(stat = "identity", color = outline) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}

# Unigram
plot_ngram_freq(uni_Freq, 200, "Uni-gram", "blue")

# Bigram
plot_ngram_freq(bi_Freq, 50, "Bi-gram", "red")

# Trigram (the title previously said "Bi-gram" by mistake)
plot_ngram_freq(tri_Freq, 10, "Tri-gram", "green")

6. Conclusion
The data has been downloaded, cleaned and explored. It will be used to build a predictive text Shiny app once the R code required for text prediction is complete.