Set up the work environment
library(tm)
library(stringi)
library(wordcloud)
library(ggplot2)
library(dplyr)
library(RColorBrewer)
library(knitr)
library(RWeka)
options(digits = 2)
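The steps below assume the three English corpus files have already been read into en_blog, en_twit and en_news; a minimal sketch of that step (assuming the files are in the working directory) would be:
# Assumes en_US.blogs.txt, en_US.twitter.txt and en_US.news.txt are in the working directory
en_blog <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
en_twit <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
en_news <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)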
Look at data sets
l1<-length(en_blog)
l2<-length(en_twit)
l3<-length(en_news)
#object.size(en_blog)
s1<-as.double(round(file.info("en_US.blogs.txt")$size/2^20,2))
s2<-as.double(round(file.info("en_US.twitter.txt")$size/2^20,2))
s3<-as.double(round(file.info("en_US.news.txt")$size/2^20,2))
w1<-sum(stri_count_words(en_blog))
w2<-sum(stri_count_words(en_twit))
w3<-sum(stri_count_words(en_news))
b0<-c("Number of LINES","FileSize in MB","Number of WORDS")
b1<-c(l1,s1,w1)
b2<-c(l2,s2,w2)
b3<-c(l3,s3,w3)
tbl<-rbind(Blog=b1,Twit=b2,News=b3)
colnames(tbl)<-b0
kable(tbl)
|     | Number of LINES| FileSize in MB| Number of WORDS|
|:----|---------------:|--------------:|---------------:|
|Blog |          899288|         200.42|        38154238|
|Twit |         2360148|         159.36|        30218166|
|News |           77259|         196.28|         2693898|
Large data sets - take a 6,000-line sample of each data set for easier handling; scaling up to the full data sets is future work
set.seed(123)
blogsample<-sample(en_blog,6000)
twitsample<-sample(en_twit,6000)
newssample<-sample(en_news,6000)
length(blogsample)
## [1] 6000
length(twitsample)
## [1] 6000
length(newssample)
## [1] 6000
Data Cleaning - remove unwanted characters
blogclean<- iconv(blogsample, 'UTF-8', 'ASCII', "byte")
twitclean<- iconv(twitsample, 'UTF-8', 'ASCII', "byte")
newsclean<- iconv(newssample, 'UTF-8', 'ASCII', "byte")
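Note that sub = "byte" keeps any non-ASCII bytes as "<hh>" hex tags. As an optional follow-up (my addition, not part of the original pipeline), those tags can be stripped so they do not survive as tokens later:
# Strip the "<hh>" hex tags left behind by iconv(..., sub = "byte")
blogclean <- gsub("<[0-9a-f]{2}>", "", blogclean)
twitclean <- gsub("<[0-9a-f]{2}>", "", twitclean)
newsclean <- gsub("<[0-9a-f]{2}>", "", newsclean)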
Combine data sets into one data set
totalclean<-c(blogclean,twitclean,newsclean)
Make a corpus, then convert to lower case, remove punctuation, remove digits and strip whitespace (the VectorSource documents are already plain text)
corpus <- VCorpus(VectorSource(totalclean))
# content_transformer keeps the documents as PlainTextDocuments, so no separate coercion step is needed
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
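As a quick sanity check (a small illustrative step I added, not in the original report), one cleaned document can be inspected:
as.character(corpus[[1]])   # first cleaned document: lower case, no punctuation or digits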
Just for fun, play with wordcloud (I am not sure why everyone does this but it looks cool!)
wc1 <- TermDocumentMatrix(corpus)
wc2<- as.matrix(wc1)
wc3 <- sort(rowSums(wc2), decreasing=TRUE)
dm <- data.frame(word=names(wc3), freq=wc3)
wordcloud(dm$word, dm$freq, min.freq= 200, max.words = 500,random.order=TRUE, colors=brewer.pal(6, "Dark2"))
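The frequency table behind the wordcloud also gives a quick numeric view of the most common terms (a small check I added):
head(dm, 10)   # ten most frequent terms in the combined sample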

N-grams are used extensively in text mining and natural language processing. They are essentially sets of co-occurring words within a given window. First I will use RWeka's NGramTokenizer to create the n-grams.
# NGramTokenizer expects a character vector, so first pull the text back out of the corpus
corpustext <- unlist(sapply(corpus, as.character))
unigram  <- NGramTokenizer(corpustext, Weka_control(min = 1, max = 1))
bigram   <- NGramTokenizer(corpustext, Weka_control(min = 2, max = 2))
trigram  <- NGramTokenizer(corpustext, Weka_control(min = 3, max = 3))
fourgram <- NGramTokenizer(corpustext, Weka_control(min = 4, max = 4))
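Before plotting, the raw tokenizer output can be summarised directly with base R for a quick look (illustrative, not in the original):
head(sort(table(bigram), decreasing = TRUE), 10)   # ten most common bigrams in the sample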
As an example, plot the frequency of the top 10 fourgrams
fourgramdf <- as.data.frame(table(fourgram), stringsAsFactors = FALSE)
fourgramdf <- fourgramdf[order(fourgramdf$Freq, decreasing = TRUE), ]
ggplot(fourgramdf[1:10, ], aes(x = reorder(fourgram, -Freq), y = Freq)) +
  geom_bar(stat = "identity", fill = "red") +
  xlab("fourgrams") + ylab("Frequency") +
  ggtitle("Top ten fourgrams") +
  theme(axis.text.x = element_text(angle = 80, hjust = 1))

Summary
I am working with a sample of the data because the full data sets are large and take too much time to process. Using n-gram frequencies, I could predict the next word being typed; this looks like a good application for machine learning algorithms, which is left for future assignments. It will be important to develop a "dictionary" of n-grams and then separate out the last word from each n-gram (so a fourgram becomes a trigram/unigram pair, and that "pair" is stored in the dictionary). Once an algorithm is built, the user will enter text and the algorithm will use the frequencies of the "pairs" in the dictionary as the basis for predicting the next word to be typed.
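A minimal sketch of that dictionary idea, using the fourgram counts computed above (the names ngram_dict and predict_next are mine and purely illustrative, not part of the assignment code):
# Split each fourgram into a trigram prefix and its last word, keeping the counts
words <- strsplit(as.character(fourgramdf$fourgram), " ")
ngram_dict <- data.frame(
  prefix   = sapply(words, function(w) paste(w[1:3], collapse = " ")),
  nextword = sapply(words, function(w) w[4]),
  freq     = fourgramdf$Freq,
  stringsAsFactors = FALSE
)

# Predict the next word as the most frequent completion of the typed trigram
predict_next <- function(text) {
  hits <- ngram_dict[ngram_dict$prefix == tolower(text), ]
  if (nrow(hits) == 0) return(NA_character_)  # prefix not seen in the sample
  hits$nextword[which.max(hits$freq)]
}

predict_next("thanks for the")   # most frequent fourth word in the sample, or NA if unseen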