In the endeavour of creating a prediction algorithm from the given set of data, my first step is to download the dataset (Datasource).

The files are downloaded, unzipped, and saved in my working directory.
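
For completeness, here is a minimal sketch of that download step (not re-run for this report). The URL is the commonly used Coursera SwiftKey dataset link and is an assumption; it should be checked against the Datasource link above.

zip_file <- "Coursera-SwiftKey.zip"
if (!file.exists(zip_file)) {
  # mode = "wb" keeps the zip intact on Windows
  download.file("https://d396qusza40orc.cloudfront.net/dscapstone/dataset/Coursera-SwiftKey.zip",
                destfile = zip_file, mode = "wb")
  unzip(zip_file)   # extracts the final/en_US files used below
}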

Before getting into the details, it is worth having an overall view of the downloaded files.

library(rJava)
library(knitr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tm)
## Loading required package: NLP
library(RWekajars)
library(RWeka)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(stringi)
library(NLP)
library(RColorBrewer)
library(wordcloud)
library(ngram)
library(slam)
library(htmlTable)
library(xtable)
Path1<-"C:/Users/MMFL/Desktop/Coursera/10.Capstone/final/en_US/en_US.blogs.txt"
Path2<-"C:/Users/MMFL/Desktop/Coursera/10.Capstone/final/en_US/en_US.news.txt"
Path3<-"C:/Users/MMFL/Desktop/Coursera/10.Capstone/final/en_US/en_US.twitter.txt"
blogs <- readLines(Path1, encoding = "UTF-8", skipNul = TRUE)
news <- readLines(Path2, encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines(Path3, encoding = "UTF-8", skipNul = TRUE)
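
One caveat: the line count reported for en_US.news.txt below is much smaller than for the other two files, which can happen when readLines stops at an embedded control (SUB/EOF) character on some platforms. If that occurs, reading through a binary connection is a common workaround (sketch only; the numbers reported here use the plain readLines call above):

con <- file(Path2, open = "rb")   # binary mode avoids stopping at an embedded EOF character
news <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)
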
Stats_Rawfile<-data.frame(FileName=c("en_US.blogs","en_US.news","en_US.twitter"),
                          FileSizeinMB=c(file.info(Path1)$size/1024^2,
                                          file.info(Path2)$size/1024^2,
                                          file.info(Path3)$size/1024^2),
                          NumberofLines=sapply(list(blogs,news,twitter),length),
                          Numberofwords=sapply(list(blogs,news,twitter),stri_stats_latex)[4,])
kable(Stats_Rawfile,caption="Stats of Raw File")
Stats of Raw File

FileName        FileSizeinMB   NumberofLines   Numberofwords
en_US.blogs         200.4242          899288        37570839
en_US.news          196.2775           77259         2651432
en_US.twitter       159.3641         2360148        30451170

Since the files are large in terms of memory and number of lines, it is necessary to sample from the three files and to build a corpus by aggregating the three samples. Through several iterations I found that a sample of 0.2% of the original lines is a workable compromise between coverage and memory use.
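
Because sample() draws random lines, setting a seed before sampling makes the subsets (and therefore the statistics below) reproducible; the seed value here is arbitrary and was not part of the original run.

set.seed(1234)   # arbitrary seed so the 0.2% sample can be reproduced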

Blogs_subset <- sample(blogs, length(blogs) * 0.002)
News_subset <- sample(news, length(news) * 0.002)
twitter_subset <- sample(twitter, length(twitter) * 0.002)
total_subset<-c(Blogs_subset,News_subset,twitter_subset)
stats_all<-data.frame(FileName=c("Sample_blog","sample_news","sample_twitter","sample_total"),
                          FileSizeinMB=sapply(list(Blogs_subset,News_subset,twitter_subset,total_subset), function(x){format(object.size(x),"MB")}),
                          NumberofLines=sapply(list(Blogs_subset,News_subset,twitter_subset,total_subset),length),
                          Numberofwords=sapply(list(Blogs_subset,News_subset,twitter_subset,total_subset),stri_stats_latex)[4,])
kable(stats_all,caption="Stats of all sample files")  
Stats of all sample files

FileName         FileSizeinMB   NumberofLines   Numberofwords
Sample_blog            0.5 Mb            1798           77074
sample_news              0 Mb             154            5018
sample_twitter         0.6 Mb            4720           60378
sample_total           1.1 Mb            6672          142470

After reducing the size, it is time to clean the data: removing punctuation, numbers, special characters, and extra whitespace, and filtering profane words (a sketch of that last step follows the corpus-building code below).

Blogs_subset <- iconv(Blogs_subset, "UTF-8", "ASCII", sub="")
News_subset <- iconv(News_subset, "UTF-8", "ASCII", sub="")
twitter_subset <- iconv(twitter_subset, "UTF-8", "ASCII", sub="")
total_subset<-c(Blogs_subset,News_subset,twitter_subset)

building.corpus <- function (x) {
  corpus <- VCorpus(VectorSource(x))
  corpus <- tm_map(corpus, content_transformer(tolower))  # lower-case all text
  corpus <- tm_map(corpus, removePunctuation)             # drop punctuation
  corpus <- tm_map(corpus, removeNumbers)                 # drop digits
  corpus <- tm_map(corpus, stripWhitespace)               # collapse repeated spaces
  corpus <- tm_map(corpus, PlainTextDocument)             # keep documents in plain-text form
  corpus                                                  # return the cleaned corpus
}
corpus.final<-building.corpus(total_subset)
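
The cleaning function above does not yet filter profanity. A minimal sketch of how that could be added with tm's removeWords, assuming a word list is available (the file name and the placeholder entries are illustrative, not part of the original code):

## In practice, load a published profanity list, e.g.
## profanity_words <- readLines("profanity_list.txt")   # hypothetical file name
profanity_words <- c("badword1", "badword2")             # placeholder entries
corpus.final <- tm_map(corpus.final, removeWords, profanity_words)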

Now we move on to the tokenization process, using RWeka's NGramTokenizer to build unigram, bigram, and trigram term-document matrices.

uni_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
corpus.uni.matrix <- TermDocumentMatrix(corpus.final, control = list(tokenize = uni_tokenizer))
corpus.uni <- findFreqTerms(corpus.uni.matrix,lowfreq = 10)
corpus.uni.f <- rowSums(as.matrix(corpus.uni.matrix[corpus.uni,]))
corpus.uni.f <- data.frame(word=names(corpus.uni.f), frequency=corpus.uni.f)
kable(head(corpus.uni.f),caption = "Only one word")
Only one word

word         frequency
ability             13
able                43
about              365
above               23
absolutely          16
according           14
bi_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
corpus.bi.matrix<- TermDocumentMatrix(corpus.final, control = list(tokenize = bi_tokenizer))
corpus.bi <- findFreqTerms(corpus.bi.matrix,lowfreq=10)
corpus.bi.f <- rowSums(as.matrix(corpus.bi.matrix[corpus.bi,]))
corpus.bi.f <- data.frame(word=names(corpus.bi.f), frequency=corpus.bi.f)
kable(head(corpus.bi.f),caption = "Two words")
Two words

word          frequency
a bad                11
a beautiful          14
a better             10
a big                21
a bit                37
a book               16
tri_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
corpus.tri.matrix <- TermDocumentMatrix(corpus.final, control = list(tokenize = tri_tokenizer))
corpus.tri <- findFreqTerms(corpus.tri.matrix,lowfreq=10)
corpus.tri.f <- rowSums(as.matrix(corpus.tri.matrix[corpus.tri,]))
corpus.tri.f <- data.frame(word=names(corpus.tri.f), frequency=corpus.tri.f)
kable(head(corpus.tri.f),caption = "Three words")
Three words

word             frequency
a bit of                11
a couple of             15
a lot of                50
and i will              10
and of course           10
as long as              10

With the n-gram frequencies calculated, we now plot the most frequent unigrams, bigrams, and trigrams.

plot.n.grams <- function(data, title, num) {
  df2 <- data[order(-data$frequency), ][1:num, ]          # keep the num most frequent n-grams
  ggplot(df2, aes(x = seq_len(num), y = frequency)) +
    geom_bar(stat = "identity", fill = "darkgreen", colour = "black") +
    coord_cartesian(xlim = c(0, num + 1)) +
    labs(title = title) +
    xlab("Words") +
    ylab("Count") +
    scale_x_continuous(breaks = seq_len(num), labels = df2$word) +  # label bars with the n-grams
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
}

U<-plot.n.grams(corpus.uni.f,"Unigrams",20)
B<-plot.n.grams(corpus.bi.f,"Bigrams",20)
Tr<-plot.n.grams(corpus.tri.f,"Trigrams",20)
gridExtra::grid.arrange(U, B, Tr, ncol = 3)

The same frequencies can alternatively be viewed as word clouds.

corpus.cloud<-list(corpus.tri.f,corpus.bi.f,corpus.uni.f)
par(mfrow=c(1, 3))
for (i in 1:3) {
  wordcloud(corpus.cloud[[i]]$word, corpus.cloud[[i]]$frequency, scale = c(3,1), max.words=100, random.order=FALSE, rot.per=0, fixed.asp = TRUE, use.r.layout = FALSE, colors=brewer.pal(8, "Dark2"))
}

Now that the groundwork is done, the next logical step is to build the prediction algorithm and deploy it as a Shiny app.
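
As a preview of that step, here is a deliberately naive sketch of how the bigram and trigram frequency tables built above could back a next-word lookup. The function name, the simple trigram-then-bigram back-off rule, and the example call are illustrative assumptions, not the final algorithm.

predict_next_word <- function(phrase, tri = corpus.tri.f, bi = corpus.bi.f) {
  words <- tolower(unlist(strsplit(phrase, "\\s+")))
  n <- length(words)
  if (n == 0) return(NA_character_)
  # Try a trigram match on the last two words first ...
  if (n >= 2) {
    prefix <- paste(words[n - 1], words[n])
    hits <- tri[grepl(paste0("^", prefix, " "), as.character(tri$word)), ]
    if (nrow(hits) > 0) {
      best <- as.character(hits$word[which.max(hits$frequency)])
      return(tail(strsplit(best, " ")[[1]], 1))
    }
  }
  # ... then back off to a bigram match on the last word.
  hits <- bi[grepl(paste0("^", words[n], " "), as.character(bi$word)), ]
  if (nrow(hits) > 0) {
    best <- as.character(hits$word[which.max(hits$frequency)])
    return(tail(strsplit(best, " ")[[1]], 1))
  }
  NA_character_   # nothing found in the sampled n-grams
}
predict_next_word("a couple")   # with the sample above, this should return "of"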