Setting up libraries

setwd("C:/Week2Assignment")
library(tm)
library(NLP)
library(ggplot2)
library(stringi)
library(dplyr)
library(RWeka)
library(wordcloud)

Initializing variables for the Blogs, News, and Twitter data

File1 = file("en_US.blogs.txt")
blogs <- readLines(File1, warn=FALSE, encoding="UTF-8", skipNul=TRUE)
close(File1)

File1 = file("en_US.news.txt")
news <- readLines(File1, warn=FALSE, encoding="UTF-8", skipNul=TRUE)
close(File1)

File1 = file("en_US.twitter.txt")
twitter <- readLines(File1, warn=FALSE, encoding="UTF-8", skipNul=TRUE)
close(File1)
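
The same read pattern repeats for all three files; as a minimal sketch (the helper name readText is hypothetical, not part of the analysis above), it could be wrapped in a single function so the connection is always closed:

readText <- function(path) {
    # open the connection explicitly, read every line, and close it on exit
    con <- file(path, open = "r")
    on.exit(close(con))
    readLines(con, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
}
# e.g. blogs <- readText("en_US.blogs.txt")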

summary <- data.frame('File' = c("Blogs", "News", "Twitter"),
                      "File Size" = sapply(list(blogs, news, twitter),
                                           function(x){format(object.size(x),"MB")}),
                      'Rows' = sapply(list(blogs, news, twitter),
                                      function(x){length(x)}),
                      'Characters' = sapply(list(blogs, news, twitter),
                                      function(x){sum(nchar(x))}),
                      'MaxCharacters' = sapply(list(blogs, news, twitter),
                                      function(x){max(unlist(lapply(x,function(y) nchar(y))))})
                      )
summary
##      File File.Size    Rows Characters MaxCharacters
## 1   Blogs  255.4 Mb  899288  206824505         40833
## 2    News   19.8 Mb   77259   15639408          5760
## 3 Twitter    319 Mb 2360148  162096241           140

Sampling Of The Data

set.seed(12345)
sample_set <- c(sample(blogs, length(blogs) * 0.01),
                sample(news, length(news) * 0.01),
                sample(twitter, length(twitter) * 0.01))
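
Note that sample() truncates the non-integer sizes above, so roughly 1% of each source is kept (8992 + 772 + 23601 = 33365 lines). Optionally, the sampled lines could be cached to disk (the file name below is only an assumption) so that later runs reuse exactly the same sample:

# Optional: persist the sample so the analysis is reproducible across sessions
writeLines(sample_set, "sample_set.txt")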

summary_ss <- 
    data.frame('File' = "Sample Set",
               "File Size" = sapply(list(sample_set), 
                                    function(x){format(object.size(x),"MB")}),
               'Rows' = sapply(list(sample_set), 
                               function(x){length(x)}),
               'Characters' = sapply(list(sample_set), 
                                     function(x){sum(nchar(x))}),
               'MaxCharacters' = sapply(list(sample_set), 
                                        function(x){max(unlist(lapply(x,
                                                                      function(y) nchar(y))))})
    )

summary_ss
##         File File.Size  Rows Characters MaxCharacters
## 1 Sample Set      6 Mb 33365    3832109          2955

Data Cleanup Activity

Removing all punctuation, numbers, and extra whitespace, and converting all characters to lower case while keeping the documents as plain text

toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
testdata <- iconv(sample_set, "UTF-8", "ASCII", sub="")
corpus <- VCorpus(VectorSource(testdata))
corpus <- tm_map(corpus, toSpace, "/")
corpus <- tm_map(corpus, toSpace, "@")
corpus <- tm_map(corpus, toSpace, "\\|")
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
# wrap tolower in content_transformer so the documents keep their PlainTextDocument class
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, stripWhitespace)
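
One cleanup step that is sometimes added for blog and Twitter text, but is not applied here (so the counts below are unaffected), is blanking out URLs with the toSpace transformer defined above; it would have to run before the removePunctuation step:

# Optional, shown for reference only - would need to run before removePunctuation:
# corpus <- tm_map(corpus, toSpace, "http[[:alnum:][:punct:]]*")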

Creating three tokenizers that split strings into n-grams, with the minimum and maximum number of grams set to the same value (1, 2, and 3 words respectively)

Creating the term-document matrices

unigram <- function(x) NGramTokenizer(x, Weka_control(min=1, max=1))
bigram <- function(x) NGramTokenizer(x, Weka_control(min=2, max=2))
trigram <- function(x) NGramTokenizer(x, Weka_control(min=3, max=3))

uniTDM <- TermDocumentMatrix(corpus, control=list(tokenize=unigram))
biTDM <- TermDocumentMatrix(corpus, control=list(tokenize=bigram))
triTDM <- TermDocumentMatrix(corpus, control=list(tokenize=trigram))
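
On some platforms tm builds these matrices in parallel, and the Java-based RWeka tokenizers are known to fail silently under that backend, leaving the bigram and trigram matrices nearly empty; a commonly used workaround, if that happens, is to disable the parallelism before the TermDocumentMatrix calls:

# Workaround if the RWeka tokenizers return empty n-grams under tm's parallel backend
options(mc.cores = 1)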

Finding frequent terms in each term-document matrix

uniTFF <- findFreqTerms(uniTDM, lowfreq = 50)
biTFF <- findFreqTerms(biTDM, lowfreq = 50)
triTFF <- findFreqTerms(triTDM, lowfreq = 10)

uni_freq <- rowSums(as.matrix(uniTDM[uniTFF, ]))
uni_freq <- data.frame(words=names(uni_freq), frequency=uni_freq)

bi_freq <- rowSums(as.matrix(biTDM[biTFF, ]))
bi_freq <- data.frame(words=names(bi_freq), frequency=bi_freq)

tri_freq <- rowSums(as.matrix(triTDM[triTFF, ]))
tri_freq <- data.frame(words=names(tri_freq), frequency=tri_freq)

head(uni_freq)
##                 words frequency
## able             able       197
## about           about      2213
## above           above       105
## absolutely absolutely        96
## access         access        53
## according   according        85
head(bi_freq)
##             words frequency
## a bad       a bad        50
## a better a better        50
## a big       a big       102
## a bit       a bit       184
## a chance a chance        55
## a couple a couple       124
head(tri_freq)
##                   words frequency
## a bit more   a bit more        17
## a bit of       a bit of        48
## a bit too     a bit too        10
## a bunch of   a bunch of        30
## a chance to a chance to        31
## a copy of     a copy of        12
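
The listings above are alphabetical because the term-document matrices store their terms in sorted order; to preview the most frequent n-grams instead, the same data frames can be sorted on the fly, for example:

# Most frequent unigrams, rather than the alphabetical head() above
head(uni_freq[order(-uni_freq$frequency), ])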

Plotting N-Grams

Unigram Frequency (200 words)

wordcloud(words=uni_freq$words, freq=uni_freq$frequency, 
          max.words=200, colors = brewer.pal(7, "Dark2"), scale=c(10, .5))

Bigram Frequency (100 words)

wordcloud(words=bi_freq$words, freq=bi_freq$frequency, 
          max.words=100, colors = brewer.pal(7, "Dark2"), scale=c(7, .5))

Trigram Frequency (30 words)

wordcloud(words=tri_freq$words, freq=tri_freq$frequency, 
          max.words=30, colors = brewer.pal(7, "Dark2"), scale=c(5, .5))

Plots using Bar charts

One Word frequency - top 15

OneWord <- ggplot(data = uni_freq[order(-uni_freq$frequency), ][1:15, ],
                  aes(x = reorder(words, -frequency), y = frequency)) +
    geom_bar(stat = "identity", fill = "red") +
    theme(axis.text.x = element_text(angle = 90)) +
    ggtitle("One Word Frequency - Top 15") +
    xlab("words") + ylab("frequency")

OneWord

Two Words frequency - top 15

TwoWords <- ggplot(data = bi_freq[order(-bi_freq$frequency), ][1:15, ],
                   aes(x = reorder(words, -frequency), y = frequency)) +
    geom_bar(stat = "identity", fill = "green") +
    theme(axis.text.x = element_text(angle = 90)) +
    ggtitle("Two Words Frequency - Top 15") +
    xlab("words") + ylab("frequency")

TwoWords

Three Words frequency - top 15

ThreeWords <- ggplot(data = tri_freq[order(-tri_freq$frequency), ][1:15, ],
                     aes(x = reorder(words, -frequency), y = frequency)) +
    geom_bar(stat = "identity", fill = "blue") +
    theme(axis.text.x = element_text(angle = 90)) +
    ggtitle("Three Words Frequency - Top 15") +
    xlab("words") + ylab("frequency")

ThreeWords

Conclusion

This concludes the initial exploratory analysis of the data. It provides the basis for a future predictive algorithm that will build on the n-gram frequencies above and be delivered as a Shiny application that predicts the next word of the sentence the user is typing.
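
As a preview of how these tables could feed that prediction step, the sketch below is one simple possibility (not the final algorithm): it looks up the most frequent trigram that starts with the last two words typed and backs off to the bigram table when nothing matches.

# Minimal next-word lookup from the n-gram tables above (a sketch only)
lastWordOf <- function(ngram) tail(strsplit(as.character(ngram), " ")[[1]], 1)

predictNext <- function(phrase, tri = tri_freq, bi = bi_freq) {
    tokens <- tolower(unlist(strsplit(phrase, "\\s+")))
    n <- length(tokens)
    if (n == 0) return(NA_character_)
    if (n >= 2) {
        # try trigrams whose first two words match the end of the phrase
        hits <- tri[grepl(paste0("^", tokens[n - 1], " ", tokens[n], " "), tri$words), ]
        if (nrow(hits) > 0) return(lastWordOf(hits$words[which.max(hits$frequency)]))
    }
    # back off to bigrams keyed on the last word only
    hits <- bi[grepl(paste0("^", tokens[n], " "), bi$words), ]
    if (nrow(hits) > 0) return(lastWordOf(hits$words[which.max(hits$frequency)]))
    NA_character_
}
# e.g. predictNext("thanks for the")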