# Read every line of a text file through a binary-mode connection.
#
# Why binary mode: a text-mode readLines() can stop early when the file
# contains an embedded control character (for example Ctrl-Z/SUB on Windows),
# which shows up as an "incomplete final line" warning and a silently
# truncated result. Binary mode reads through such bytes.
# NOTE(review): a text-mode read of the news file produced exactly that
# warning and a suspiciously small line count -- confirm the counts after
# re-running with this reader.
#
# path: path to a UTF-8 encoded text file.
# Returns a character vector with one element per line (embedded NULs are
# skipped).
read_corpus_lines <- function(path) {
  con <- file(path, open = "rb")
  # Always release the connection, even if readLines() fails.
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

usTwitter <- read_corpus_lines("final/en_US/en_US.twitter.txt")
usBlogs <- read_corpus_lines("final/en_US/en_US.blogs.txt")
usNews <- read_corpus_lines("final/en_US/en_US.news.txt")
## Warning in readLines("final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul =
## TRUE): incomplete final line found on 'final/en_US/en_US.news.txt'
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(stringi)
library(tm)
## Warning: package 'tm' was built under R version 3.6.3
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(RWeka)
## Warning: package 'RWeka' was built under R version 3.6.3
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.6.3
## Loading required package: RColorBrewer
# Summary table with one row per corpus: size of the file on disk (MB),
# word count, line count and character count.
#
# - File.size.in.MB is taken from the files on disk with file.size();
#   object.size() would report the in-memory size of the R character vector,
#   which is not the file size the column name promises.
# - Statistics are extracted by name ("Words", "Lines", "Chars") rather than
#   by row position, and vapply() pins each result to a single number.
# The intermediate objects live inside local() so that only `sumar` is
# created in the calling environment.
sumar <- local({
  corpus_files <- c(
    usBlogs = "final/en_US/en_US.blogs.txt",
    usNews = "final/en_US/en_US.news.txt",
    usTwitter = "final/en_US/en_US.twitter.txt"
  )
  corpora <- list(usBlogs, usNews, usTwitter)

  # Compute the general statistics once per corpus and reuse them below.
  general_stats <- lapply(corpora, stri_stats_general)
  pick_stat <- function(stats, field) {
    vapply(stats, function(s) s[[field]], numeric(1))
  }

  data.frame(
    Data.Source = names(corpus_files),
    File.size.in.MB = unname(file.size(corpus_files)) / 1024^2,
    WordCount = vapply(
      corpora,
      function(x) stri_stats_latex(x)[["Words"]],
      numeric(1)
    ),
    Lines = pick_stat(general_stats, "Lines"),
    Chars = pick_stat(general_stats, "Chars")
  )
})
sumar
## Data.Source File.size.in.MB WordCount Lines Chars
## 1 usBlogs 255.35453 37570839 899288 206824382
## 2 usNews 19.76917 2651432 77259 15639408
## 3 usTwitter 318.98975 30451170 2360148 162096241
Next, we draw a 1% random sample from each source and apply some basic cleaning to build our training set.
# Draw a reproducible 1% sample from each source and build a cleaned corpus.
set.seed(155)
testing <- c(
  sample(usBlogs, length(usBlogs) * 0.01),
  sample(usNews, length(usNews) * 0.01),
  sample(usTwitter, length(usTwitter) * 0.01)
)

# Drop every character that cannot be represented in ASCII.
testdata <- iconv(testing, "UTF-8", "ASCII", sub = "")

# NOTE: `sample` (the corpus) shares its name with base::sample(); the name
# is kept because the code that follows refers to it.
sample <- VCorpus(VectorSource(testdata))

# tolower() is a plain character function, so it must be wrapped in
# content_transformer() to keep each element a text document. Mapping the
# bare function turns the documents into character vectors, which then have
# to be repaired with a PlainTextDocument pass afterwards.
sample <- tm_map(sample, content_transformer(tolower))
sample <- tm_map(sample, removePunctuation)
sample <- tm_map(sample, removeNumbers)
# Collapse whitespace last: removing numbers can leave runs of blanks behind.
sample <- tm_map(sample, stripWhitespace)
Using the RWeka package, we find the most frequent word combinations (n-grams).
# Build a term-document matrix for the corpus using the given tokenizer.
#
# wordLengths = c(1, Inf) keeps short terms: the tm default of c(3, Inf)
# silently discards one- and two-letter words such as "a", "i", "to" and
# "of", which distorts the unigram counts.
ngram_tdm <- function(corpus, tokenizer) {
  TermDocumentMatrix(
    corpus,
    control = list(tokenize = tokenizer, wordLengths = c(1, Inf))
  )
}

# Total count of each of `terms` across all documents of `tdm`, returned as a
# data frame with columns `words` and `frequency`.
# Only the rows for `terms` are converted to a dense matrix.
term_frequency_table <- function(tdm, terms) {
  counts <- rowSums(as.matrix(tdm[terms, ]))
  data.frame(words = names(counts), frequency = counts)
}

# Unigrams: terms that occur at least 50 times.
unigram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
unidtf <- ngram_tdm(sample, unigram)
uni_tf <- findFreqTerms(unidtf, lowfreq = 50)
uni_freq <- term_frequency_table(unidtf, uni_tf)

# Bigrams: terms that occur at least 50 times.
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
bidtf <- ngram_tdm(sample, bigram)
bi_tf <- findFreqTerms(bidtf, lowfreq = 50)
bi_freq <- term_frequency_table(bidtf, bi_tf)

# Trigrams: terms that occur at least 10 times.
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tridtf <- ngram_tdm(sample, trigram)
tri_tf <- findFreqTerms(tridtf, lowfreq = 10)
tri_freq <- term_frequency_table(tridtf, tri_tf)
Now, using the wordcloud and ggplot2 packages, we plot our results.
# Word clouds of the 100 most frequent unigrams, bigrams and trigrams.
wordcloud(
  words = uni_freq$words, freq = uni_freq$frequency,
  max.words = 100, colors = c(1:5)
)
wordcloud(
  words = bi_freq$words, freq = bi_freq$frequency,
  max.words = 100, colors = c(1:5)
)
# Trigrams are long: at the default scale many of them cannot be placed and
# wordcloud() drops them with a "could not be fit on page" warning. A smaller
# font-size range leaves room for all of them.
# NOTE(review): the range below is a starting point -- shrink it further if
# warnings remain on your graphics device.
wordcloud(
  words = tri_freq$words, freq = tri_freq$frequency,
  max.words = 100, colors = c(1:5), scale = c(2.5, 0.4)
)
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : i dont
## know could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : you
## have to could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : is
## going to could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : some
## of the could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : thank
## you for could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : one of
## the could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : i have
## to could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : part
## of the could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : this
## is a could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : would
## like to could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : when i
## was could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : i love
## you could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : if you
## are could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : it
## would be could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : to see
## you could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : it was
## a could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : let me
## know could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : is one
## of could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : if you
## dont could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : had a
## great could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : one of
## those could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : as
## well as could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : that i
## have could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : back
## to the could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : i have
## a could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : i
## think its could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : be
## able to could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : want
## to be could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : what
## do you could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : out of
## the could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : wish i
## could could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : happy
## mothers day could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : a
## couple of could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : i want
## to could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : the
## first time could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : in the
## world could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : you
## have a could not be fit on page. It will not be plotted.
# Bar chart of the `n` most frequent terms in a frequency table.
#
# freq:          data frame with columns `words` and `frequency`.
# title:         plot title.
# fill:          bar colour.
# n:             maximum number of terms to show.
# rotate_labels: rotate the x-axis labels by 45 degrees (useful for long
#                n-grams).
# Returns a ggplot object.
plot_top_terms <- function(freq, title, fill, n = 15, rotate_labels = FALSE) {
  # head() returns fewer rows when the table is short, whereas indexing with
  # 1:n would pad the result with NA rows.
  top_terms <- head(freq[order(-freq$frequency), ], n)
  bars <- ggplot(
    data = top_terms,
    aes(x = reorder(words, -frequency), y = frequency)
  ) +
    geom_bar(stat = "identity", fill = fill) +
    ggtitle(title) + xlab("w") + ylab("f")
  if (rotate_labels) {
    # hjust = 1 anchors the rotated labels at their tick marks.
    bars <- bars + theme(axis.text.x = element_text(angle = 45, hjust = 1))
  }
  bars
}

plot_freq_1 <- plot_top_terms(uni_freq, "Top Words", "blue")
plot_freq_2 <- plot_top_terms(bi_freq, "Bigrams", "orange",
                              rotate_labels = TRUE)
plot_freq_3 <- plot_top_terms(tri_freq, "Trigrams", "purple",
                              rotate_labels = TRUE)
# Draw several plots on one page, arranged in a grid.
#
# ...:      plot objects to draw.
# plotlist: optional list of further plot objects, appended after `...`.
# file:     not used by this function; kept so existing calls keep working.
# cols:     number of columns in the grid; ignored when `layout` is given.
# layout:   optional matrix whose cells hold plot indices; plot i is drawn in
#           the cell(s) equal to i. When NULL the layout is derived from
#           `cols`, with cells numbered down each column.
#
# A single plot is simply printed. With no plots at all, nothing is drawn
# and NULL is returned invisibly.
multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
  # Attaching grid here is retained from the original implementation so that
  # code relying on it being attached afterwards is unaffected.
  library(grid)

  # Make a single list from the ... arguments and plotlist.
  plots <- c(list(...), plotlist)
  num_plots <- length(plots)

  # Nothing to draw: bail out instead of indexing into an empty list.
  if (num_plots == 0) {
    return(invisible(NULL))
  }

  # If layout is NULL, use `cols` to determine it: as many rows as needed to
  # hold every plot.
  if (is.null(layout)) {
    num_rows <- ceiling(num_plots / cols)
    layout <- matrix(seq_len(cols * num_rows), ncol = cols, nrow = num_rows)
  }

  if (num_plots == 1) {
    print(plots[[1]])
  } else {
    # Set up the page with one viewport cell per layout cell.
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

    for (i in seq_len(num_plots)) {
      # Row/column positions of the layout cells assigned to plot i.
      cells <- as.data.frame(which(layout == i, arr.ind = TRUE))
      print(plots[[i]], vp = viewport(layout.pos.row = cells$row,
                                      layout.pos.col = cells$col))
    }
  }
}
# Arrange the three frequency charts on one page, two columns wide.
multiplot(
  plotlist = list(plot_freq_1, plot_freq_2, plot_freq_3),
  cols = 2
)