Exploratory Phase

Loading the data

# Read a UTF-8 text file line by line through a binary-mode connection.
#
# readLines() on a bare path opens the file in text mode. On Windows an
# embedded Ctrl-Z (SUB, 0x1A) byte is then treated as end-of-file, which
# silently truncates the data; en_US.news.txt appears to be affected (the
# summary below reports only 77,259 lines for it). Opening with "rb" reads
# the whole file on every platform.
#
# path: path of the text file to read.
# Returns a character vector with one element per line.
read_corpus_lines <- function(path) {
  con <- file(path, open = "rb")
  # Close the connection even if readLines() fails.
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

usTwitter <- read_corpus_lines("final/en_US/en_US.twitter.txt")
usBlogs <- read_corpus_lines("final/en_US/en_US.blogs.txt")
usNews <- read_corpus_lines("final/en_US/en_US.news.txt")

## Warning in readLines("final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul =
## TRUE): incomplete final line found on 'final/en_US/en_US.news.txt'

Loading the required libraries

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.6.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(stringi)
library(tm)

## Warning: package 'tm' was built under R version 3.6.3

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:ggplot2':
## 
##     annotate

library(RWeka)

## Warning: package 'RWeka' was built under R version 3.6.3

library(wordcloud)

## Warning: package 'wordcloud' was built under R version 3.6.3

## Loading required package: RColorBrewer

Summarize Data

# Per-source summary table: in-memory size in MB, word count, line count and
# character count. Built inside local() so the helpers do not leak into the
# global environment.
sumar <- local({
  corpora <- list(usBlogs, usNews, usTwitter)

  # stringi statistics, computed once per source.
  latex_stats <- lapply(corpora, stri_stats_latex)
  general_stats <- lapply(corpora, stri_stats_general)

  # Pull one named statistic out of every per-source result.
  pick <- function(stats, field) unlist(lapply(stats, `[[`, field))

  size_mb <- unlist(lapply(corpora, function(txt) {
    as.numeric(object.size(txt) / (1024^2))
  }))

  data.frame(
    Data.Source = c("usBlogs", "usNews", "usTwitter"),
    File.size.in.MB = size_mb,
    WordCount = pick(latex_stats, "Words"),
    Lines = pick(general_stats, "Lines"),
    Chars = pick(general_stats, "Chars")
  )
})

sumar

##   Data.Source File.size.in.MB WordCount   Lines     Chars
## 1     usBlogs       255.35453  37570839  899288 206824382
## 2      usNews        19.76917   2651432   77259  15639408
## 3   usTwitter       318.98975  30451170 2360148 162096241

Clean and sample the data

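Next we draw a 1% random sample from each source and apply some basic cleaning (lower-casing and removal of whitespace, punctuation and numbers) to build the corpus used in the rest of the analysis.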

# Draw a reproducible 1% sample of lines from each source.
set.seed(155)
testing <- c(sample(usBlogs, length(usBlogs) * 0.01),
             sample(usNews, length(usNews) * 0.01),
             sample(usTwitter, length(usTwitter) * 0.01))

# Drop every character that has no ASCII representation.
testdata <- iconv(testing, "UTF-8", "ASCII", sub = "")

# Build the corpus and clean it.
# tolower() is a plain character function, so it is wrapped in
# content_transformer() to keep each element a text document; this removes
# the need to re-wrap the corpus with PlainTextDocument afterwards.
# NOTE: from here on `sample` names the corpus and masks base::sample as a
# variable (function calls to sample() still resolve to the base function).
sample <- VCorpus(VectorSource(testdata))
sample <- tm_map(sample, content_transformer(tolower))
sample <- tm_map(sample, removePunctuation)
sample <- tm_map(sample, removeNumbers)
# Strip whitespace last so gaps left by the removals above are collapsed too.
sample <- tm_map(sample, stripWhitespace)

Finding the most popular words

Using the RWeka package, we find the most frequent single words (unigrams) and the most frequent two- and three-word combinations (bigrams and trigrams).

# Tokenizer that emits n-grams of exactly one word.
unigram <- function(x) {
  NGramTokenizer(x, Weka_control(min = 1, max = 1))
}

# Term-document matrix of unigrams over the cleaned corpus.
# NOTE(review): tm's default term-length bounds may drop very short words
# here - confirm whether that is intended.
unidtf <- TermDocumentMatrix(sample, control = list(tokenize = unigram))

# Terms occurring at least 50 times.
uni_tf <- findFreqTerms(unidtf, 50)

# Total count per frequent term, as a words/frequency data frame.
uni_freq <- local({
  counts <- rowSums(as.matrix(unidtf[uni_tf, ]))
  data.frame(words = names(counts), frequency = counts)
})


# Tokenizer that emits n-grams of exactly two words.
bigram <- function(x) {
  NGramTokenizer(x, Weka_control(min = 2, max = 2))
}

# Term-document matrix of bigrams over the cleaned corpus.
bidtf <- TermDocumentMatrix(sample, control = list(tokenize = bigram))

# Bigrams occurring at least 50 times.
bi_tf <- findFreqTerms(bidtf, 50)

# Total count per frequent bigram, as a words/frequency data frame.
bi_freq <- local({
  counts <- rowSums(as.matrix(bidtf[bi_tf, ]))
  data.frame(words = names(counts), frequency = counts)
})



# Tokenizer that emits n-grams of exactly three words.
trigram <- function(x) {
  NGramTokenizer(x, Weka_control(min = 3, max = 3))
}

# Term-document matrix of trigrams over the cleaned corpus.
tridtf <- TermDocumentMatrix(sample, control = list(tokenize = trigram))

# Trigrams occurring at least 10 times (lower cut-off than for the shorter
# n-grams).
tri_tf <- findFreqTerms(tridtf, 10)

# Total count per frequent trigram, as a words/frequency data frame.
tri_freq <- local({
  counts <- rowSums(as.matrix(tridtf[tri_tf, ]))
  data.frame(words = names(counts), frequency = counts)
})

Plotting the findings

Now, using the wordcloud and ggplot2 packages, we plot our results.

# Word cloud of the 100 most frequent unigrams.
wordcloud(words = uni_freq$words, freq = uni_freq$frequency,
          max.words = 100, colors = c(1:5))

# Word cloud of the 100 most frequent bigrams.
wordcloud(words = bi_freq$words, freq = bi_freq$frequency,
          max.words = 100, colors = c(1:5))

# Word cloud of the 100 most frequent trigrams.
# With the default size range many three-word phrases were reported as
# "could not be fit on page" and left out of the plot, so a smaller `scale`
# (largest, smallest word size) is used to make them more likely to fit.
wordcloud(words = tri_freq$words, freq = tri_freq$frequency,
          max.words = 100, scale = c(2, 0.3), colors = c(1:5))

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : i dont
## know could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : you
## have to could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : is
## going to could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : some
## of the could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : thank
## you for could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : one of
## the could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : i have
## to could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : part
## of the could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : this
## is a could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : would
## like to could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : when i
## was could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : i love
## you could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : if you
## are could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : it
## would be could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : to see
## you could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : it was
## a could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : let me
## know could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : is one
## of could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : if you
## dont could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : had a
## great could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : one of
## those could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : as
## well as could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : that i
## have could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : back
## to the could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : i have
## a could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : i
## think its could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : be
## able to could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : want
## to be could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : what
## do you could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : out of
## the could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : wish i
## could could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : happy
## mothers day could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : a
## couple of could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : i want
## to could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : the
## first time could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : in the
## world could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : you
## have a could not be fit on page. It will not be plotted.

# Bar chart of the 15 most frequent unigrams, bars ordered by descending count.
plot_freq_1 <- ggplot(
  data = uni_freq[order(-uni_freq$frequency), ][seq_len(15), ],
  mapping = aes(x = reorder(words, -frequency), y = frequency)
) +
  geom_bar(stat = "identity", fill = "blue") +
  labs(title = "Top Words", x = "w", y = "f")

# Same chart for bigrams; x labels are tilted 45 degrees.
plot_freq_2 <- ggplot(
  data = bi_freq[order(-bi_freq$frequency), ][seq_len(15), ],
  mapping = aes(x = reorder(words, -frequency), y = frequency)
) +
  geom_bar(stat = "identity", fill = "orange") +
  theme(axis.text.x = element_text(angle = 45)) +
  labs(title = "Bigrams", x = "w", y = "f")

# Same chart for trigrams; x labels are tilted 45 degrees.
plot_freq_3 <- ggplot(
  data = tri_freq[order(-tri_freq$frequency), ][seq_len(15), ],
  mapping = aes(x = reorder(words, -frequency), y = frequency)
) +
  geom_bar(stat = "identity", fill = "purple") +
  theme(axis.text.x = element_text(angle = 45)) +
  labs(title = "Trigrams", x = "w", y = "f")

# Draw several plot objects on one page, arranged in a grid.
#
# Arguments:
#   ...      : plot objects to draw.
#   plotlist : optional list of further plot objects, appended after `...`.
#   file     : not used by this function; kept so existing calls keep working.
#   cols     : number of columns in the grid (ignored when `layout` is given).
#   layout   : optional matrix of plot indices; the cells holding the value i
#              are the region where plot i is drawn. When NULL, a grid with
#              `cols` columns is built to hold all plots.
#
# With no plots at all the function does nothing and returns NULL invisibly.
# A single plot is simply printed; two or more are printed into viewports of
# a new grid page.
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  library(grid)

  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)

  numPlots <- length(plots)

  # Nothing to draw: return early instead of failing on plots[[1]] below.
  if (numPlots == 0) {
    return(invisible(NULL))
  }

  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # ncol: number of columns of plots
    # nrow: number of rows needed, calculated from the number of columns
    rows <- ceiling(numPlots / cols)
    layout <- matrix(seq_len(cols * rows), ncol = cols, nrow = rows)
  }

  if (numPlots == 1) {
    print(plots[[1]])

  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

    # Make each plot, in the correct location
    for (i in seq_len(numPlots)) {
      # Get the row/column positions of the layout cells assigned to plot i
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}

# Draw the three frequency charts together on a two-column page.
multiplot(plotlist = list(plot_freq_1, plot_freq_2, plot_freq_3), cols = 2)