Text analytics is the process of deriving high-quality information from text. This information is typically obtained by discovering patterns and trends through statistical pattern learning.
By analyzing the most frequent words and building n-gram models, we can construct a model that predicts the next word from what a user has typed into a mobile keyboard or text box.
The goal of this report is to summarize the textual data collected from Twitter, blogs and news articles, and to prepare the data for building the prediction algorithm.
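As a toy illustration of the idea, counting adjacent word pairs (bigrams) in a handful of sentences already suggests the most likely continuation of a word; the three sentences below are invented purely for illustration and are not part of the data set.
# Toy bigram counts built from three made-up sentences
toy <- c("i love data science", "i love r", "data science is fun")
toy_bigrams <- unlist(lapply(strsplit(toy, " "), function(w) paste(head(w, -1), tail(w, -1))))
toy_counts <- sort(table(toy_bigrams), decreasing = TRUE)
toy_counts[grep("^love ", names(toy_counts))]   # candidate continuations of "love"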
We read the Twitter, blogs and news data sets into R and create a corpus.
bl_file<-"S:/Software/R/Data Science Specialization/Capstone Project/final/en_US/en_US.blogs.txt"
blogs = file(bl_file,"r")
tw_file = "S:/Software/R/Data Science Specialization/Capstone Project/final/en_US/en_US.twitter.txt"
twitter = file(tw_file,"r")
ne_file = "S:/Software/R/Data Science Specialization/Capstone Project/final/en_US/en_US.news.txt"
news = file(ne_file,"r")
TwitterDt <- readLines(twitter, encoding = "UTF-8")
## Warning in readLines(twitter, encoding = "UTF-8"): line 167155 appears to
## contain an embedded nul
## Warning in readLines(twitter, encoding = "UTF-8"): line 268547 appears to
## contain an embedded nul
## Warning in readLines(twitter, encoding = "UTF-8"): line 1274086 appears to
## contain an embedded nul
## Warning in readLines(twitter, encoding = "UTF-8"): line 1759032 appears to
## contain an embedded nul
BlogsDT <- readLines(blogs, encoding = "UTF-8")
NewsDt <- readLines(news, encoding = "UTF-8")
## Warning in readLines(news, encoding = "UTF-8"): incomplete final line found
## on 'S:/Software/R/Data Science Specialization/Capstone Project/final/en_US/
## en_US.news.txt'
library(stringi)
# Count the non-space characters in each line
char <- function(x){
  stri_length(x) - stri_count_fixed(x, " ")
}
File <- c("Twitter","Blogs", "News")
File_Size <- c(round(file.info(tw_file)$size/1024^2), round(file.info(bl_file)$size/1024^2), round(file.info(ne_file)$size/1024^2))   # size in MB
Lines <- c(length(TwitterDt), length(BlogsDT), length(NewsDt))
Words<-c(sum(stri_count_words(TwitterDt)), sum(stri_count_words(BlogsDT)), sum(stri_count_words(NewsDt)))
Characters <- c(sum(char(TwitterDt)), sum(char(BlogsDT)),sum(char(NewsDt)))
Word_Per_Line <- Words/Lines
data.frame(File, File_Size, Lines, Words, Characters, Word_Per_Line)
## File File_Size Lines Words Characters Word_Per_Line
## 1 Twitter 159 1 30093369 134082634 30093369
## 2 Blogs 200 1 37546246 170389662 37546246
## 3 News 196 1 2674536 13072698 2674536
TW_Sample <- sample(TwitterDt, 1200)
BL_Sample <- sample(BlogsDT, 1200)
NE_Sample <- sample(NewsDt, 1200)
Data_Sample <- c(TW_Sample, BL_Sample, NE_Sample)
We clean the data by removing punctuation, special characters and numbers, and by converting all letters to lower case. After cleaning, we create a word cloud to examine the most frequent words in the data set.
library(stringi)
library(wordcloud)
library(tm)
set.seed(100)
head(Data_Sample,10)
## [1] "willy called to tell me I'm a pansy"
## [2] "Hey guys! Check out on twitter and on YouTube! Check out his gaming vids, and his other vids if you want! :D"
## [3] "Ok - who knows the best boards with ideas for family activities, games, and parenting? We're making a top10 list!"
## [4] "im doin homework >.<"
## [5] "Best Animated FEature Film goes to Rango... blah"
## [6] "hey its my sweet16 birthday 1"
## [7] "“If this situation is not addressed, and not addressed now with increased urgency, millions of children will continue to die..."
## [8] "That was a valid point. Thanks for sharing your opinion with us!"
## [9] "& remained around the initial asking price. But it'll turn into a long term investment like I had assumed. The company will grow"
## [10] "Meet with my attorney client this morning. Business has more than doubled since I started working with them. And we just got started!"
Data_Sample<-iconv(Data_Sample,"latin1","ASCII",sub="")
dtcorpus<-VCorpus(VectorSource(Data_Sample))
dtcorpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 3600
getTransformations()
## [1] "removeNumbers" "removePunctuation" "removeWords"
## [4] "stemDocument" "stripWhitespace"
docs <- tm_map(dtcorpus, removePunctuation)          # remove punctuation
docs <- tm_map(docs, removeNumbers)                  # remove numbers
docs <- tm_map(docs, content_transformer(tolower))   # transform to lower case
docs <- tm_map(docs, stripWhitespace)                # collapse extra whitespace
docs <- tm_map(docs, PlainTextDocument)
wordcloud(docs, max.words = 100, random.order = FALSE, colors = brewer.pal(8, "Dark2"))
writeLines(as.character(docs[[20]]))
## your credit score is you
library(RWeka)
TextData<-TermDocumentMatrix(docs)
m<-as.matrix(TextData)
v<-sort(rowSums(m), decreasing=TRUE)
d<-data.frame(word=names(v),freq=v)
head(d,10)
## word freq
## the the 5359
## and and 2716
## that that 1103
## for for 1085
## you you 821
## with with 750
## was was 679
## this this 596
## have have 592
## are are 531
barplot(d[1:10,]$freq, las=2, names.arg= d[1:10,]$word, col="light blue", main="Most Frequent Words", ylab="Word Frequencies")
library(RWeka)
Corpusa<-tm_map(docs,PlainTextDocument)
UnigramTokenizer<- function(x){NGramTokenizer(x, Weka_control(min=1, max=1))}
BigramTokenizer<- function(x){NGramTokenizer(x, Weka_control(min=2, max=2))}
TrigramTokenizer<- function(x){NGramTokenizer(x, Weka_control(min=3, max=3))}
Unigrams<-TermDocumentMatrix(Corpusa, control = list(tokenize = UnigramTokenizer))
Bigrams<-TermDocumentMatrix(Corpusa, control = list(tokenize = BigramTokenizer))
Trigrams<-TermDocumentMatrix(Corpusa, control = list(tokenize = TrigramTokenizer))
Unigrams
## <<TermDocumentMatrix (terms: 15209, documents: 3600)>>
## Non-/sparse entries: 71436/54680964
## Sparsity : 100%
## Maximal term length: 51
## Weighting : term frequency (tf)
Bigrams
## <<TermDocumentMatrix (terms: 67701, documents: 3600)>>
## Non-/sparse entries: 98964/243624636
## Sparsity : 100%
## Maximal term length: 59
## Weighting : term frequency (tf)
Trigrams
## <<TermDocumentMatrix (terms: 91997, documents: 3600)>>
## Non-/sparse entries: 97010/331092190
## Sparsity : 100%
## Maximal term length: 69
## Weighting : term frequency (tf)
library(ggplot2)
freq_frame <- function(xyz){
  # sort terms by total frequency across all documents
  freq <- sort(rowSums(as.matrix(xyz)), decreasing = TRUE)
  data.frame(word = names(freq), freq = freq)
}
UnigramsMostFreq<-removeSparseTerms(Unigrams,0.999)
UnigramsMostFreqSorted<-freq_frame(UnigramsMostFreq)
BigramsMostFreq<-removeSparseTerms(Bigrams,0.999)
BigramsMostFreqSorted<-freq_frame(BigramsMostFreq)
TrigramsMostFreq<-removeSparseTerms(Trigrams,0.999)
TrigramsMostFreqSorted<-freq_frame(TrigramsMostFreq)
UG <- ggplot(data = UnigramsMostFreqSorted[1:35,], aes(x = reorder(word, -freq), y = freq)) + geom_bar(stat = "identity")
UG <- UG + labs(x = "N-grams", y = "Frequency", title = "Frequencies of the Most Common Individual Words") + theme(axis.text.x = element_text(angle = 90))
UG
BI <- ggplot(data = BigramsMostFreqSorted[1:35,], aes(x = reorder(word, -freq), y = freq)) + geom_bar(stat = "identity")
BI <- BI + labs(x = "N-grams", y = "Frequency", title = "Frequencies of the Most Common Bigrams") + theme(axis.text.x = element_text(angle = 90))
BI
TRI <- ggplot(data = TrigramsMostFreqSorted[1:35,], aes(x = reorder(word, -freq), y = freq)) + geom_bar(stat = "identity")
TRI <- TRI + labs(x = "N-grams", y = "Frequency", title = "Frequencies of the Most Common Trigrams") + theme(axis.text.x = element_text(angle = 90))
TRI
We will build on these n-gram models for further analysis and for the prediction algorithm and Shiny app. Given a string of words, the most likely next word can be predicted; the frequency with which word sequences occur in the corpus is the most relevant feature for this prediction.
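As a minimal sketch of how these frequency tables could feed the prediction step, the hypothetical helper below looks up the last one or two typed words in the sorted trigram and bigram tables built above and returns the most frequent continuation; the function name, the back-off order, and the regex matching are assumptions for illustration, not the final algorithm.
# Hypothetical frequency-lookup sketch: try trigrams first, then back off to bigrams.
last_word <- function(ngram) tail(strsplit(as.character(ngram), " ")[[1]], 1)
predict_next <- function(phrase, tri = TrigramsMostFreqSorted, bi = BigramsMostFreqSorted){
  w <- tolower(unlist(strsplit(phrase, "\\s+")))
  if(length(w) >= 2){
    hit <- grep(paste0("^", w[length(w) - 1], " ", w[length(w)], " "), tri$word)
    if(length(hit) > 0) return(last_word(tri$word[hit[1]]))  # tables are already sorted by frequency
  }
  hit <- grep(paste0("^", w[length(w)], " "), bi$word)
  if(length(hit) > 0) return(last_word(bi$word[hit[1]]))
  NA_character_
}
predict_next("thanks for the")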