Initializing Variables for Blogs, News and Twitter
# Open each file as a connection, read all lines, then close the connection
con <- file("en_US.blogs.txt")
blogs <- readLines(con, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
close(con)

con <- file("en_US.news.txt")
news <- readLines(con, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
close(con)

con <- file("en_US.twitter.txt")
twitter <- readLines(con, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
close(con)
# Summarize each source: in-memory size, line count, total characters and longest line
datasets <- list(blogs, news, twitter)
summary <- data.frame(
  File          = c("Blogs", "News", "Twitter"),
  File.Size     = sapply(datasets, function(x) format(object.size(x), units = "Mb")),
  Rows          = sapply(datasets, length),
  Characters    = sapply(datasets, function(x) sum(nchar(x))),
  MaxCharacters = sapply(datasets, function(x) max(nchar(x)))
)
summary
## File File.Size Rows Characters MaxCharacters
## 1 Blogs 255.4 Mb 899288 206824505 40833
## 2 News 19.8 Mb 77259 15639408 5760
## 3 Twitter 319 Mb 2360148 162096241 140
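Note that the row count for the news file is much lower than for the other two sources; on some platforms readLines() in text mode stops early at an embedded control character in en_US.news.txt. A minimal sketch of a binary-mode read that may recover the remaining lines (not used for the figures above, which reflect the text-mode read):

con <- file("en_US.news.txt", open = "rb")   # open in binary mode
news_full <- readLines(con, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
close(con)
length(news_full)   # expected to exceed the 77259 rows reported above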
Sampling of the Data
# Take a reproducible 1% random sample from each source and combine them
set.seed(12345)
sample_set <- c(sample(blogs,   length(blogs)   * 0.01),
                sample(news,    length(news)    * 0.01),
                sample(twitter, length(twitter) * 0.01))
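To avoid re-reading the full files on every run, the sample could be cached to disk, for example (the file name sample_set.rds is only an illustration):

saveRDS(sample_set, "sample_set.rds")        # write the sample once
# sample_set <- readRDS("sample_set.rds")    # reload it in later sessions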
# Same summary statistics for the combined 1% sample
summary_ss <- data.frame(
  File          = "Sample Set",
  File.Size     = format(object.size(sample_set), units = "Mb"),
  Rows          = length(sample_set),
  Characters    = sum(nchar(sample_set)),
  MaxCharacters = max(nchar(sample_set))
)
summary_ss
## File File.Size Rows Characters MaxCharacters
## 1 Sample Set 6 Mb 33365 3832109 2955
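As a quick sanity check, the 33365 rows correspond to 1% of each source with the fractional sample sizes effectively truncated to integers:

floor(c(blogs = 899288, news = 77259, twitter = 2360148) * 0.01)
# blogs 8992, news 772, twitter 23601; together 8992 + 772 + 23601 = 33365 rows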
Data Cleanup Activity
Converting the sample to plain ASCII text, removing all punctuation, numbers and extra whitespace, and changing all characters to lower case.
library(tm)

# Custom transformer that replaces a matched pattern with a space
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

# Drop non-ASCII characters before building the corpus
testdata <- iconv(sample_set, "UTF-8", "ASCII", sub = "")
corpus <- VCorpus(VectorSource(testdata))

corpus <- tm_map(corpus, toSpace, "/")
corpus <- tm_map(corpus, toSpace, "@")
corpus <- tm_map(corpus, toSpace, "\\|")
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, stripWhitespace)
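A quick way to verify the cleanup is to print a few documents from the corpus; this is purely a spot check and not part of the processing pipeline:

# Inspect the first three cleaned documents
for (i in 1:3) writeLines(as.character(corpus[[i]]))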
Creating three tokenized matrices by splitting strings into n-grams with minimal and maximal numbers of grams
Creating the term-document matrices
library(RWeka)

# Tokenizers for unigrams, bigrams and trigrams
unigram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bigram  <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

# Term-document matrices built from the cleaned corpus with each tokenizer
uniTDM <- TermDocumentMatrix(corpus, control = list(tokenize = unigram))
biTDM  <- TermDocumentMatrix(corpus, control = list(tokenize = bigram))
triTDM <- TermDocumentMatrix(corpus, control = list(tokenize = trigram))
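If the matrices grow too large to hold in memory, tm's removeSparseTerms() can drop terms that appear in very few documents. This step is not applied in this report; the 0.999 threshold below is only an illustrative value:

uniTDM_small <- removeSparseTerms(uniTDM, sparse = 0.999)
biTDM_small  <- removeSparseTerms(biTDM,  sparse = 0.999)
triTDM_small <- removeSparseTerms(triTDM, sparse = 0.999)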
Finding frequent terms in each term-document matrix and computing their frequencies
# Keep terms occurring at least 50 times (10 times for trigrams)
uniTFF <- findFreqTerms(uniTDM, lowfreq = 50)
biTFF  <- findFreqTerms(biTDM,  lowfreq = 50)
triTFF <- findFreqTerms(triTDM, lowfreq = 10)

# Sum counts across documents and store each result as a word/frequency data frame
uni_freq <- rowSums(as.matrix(uniTDM[uniTFF, ]))
uni_freq <- data.frame(words = names(uni_freq), frequency = uni_freq)
bi_freq <- rowSums(as.matrix(biTDM[biTFF, ]))
bi_freq <- data.frame(words = names(bi_freq), frequency = bi_freq)
tri_freq <- rowSums(as.matrix(triTDM[triTFF, ]))
tri_freq <- data.frame(words = names(tri_freq), frequency = tri_freq)
head(uni_freq)
## words frequency
## able able 197
## about about 2213
## above above 105
## absolutely absolutely 96
## access access 53
## according according 85
head(bi_freq)
## words frequency
## a bad a bad 50
## a better a better 50
## a big a big 102
## a bit a bit 184
## a chance a chance 55
## a couple a couple 124
head(tri_freq)
## words frequency
## a bit more a bit more 17
## a bit of a bit of 48
## a bit too a bit too 10
## a bunch of a bunch of 30
## a chance to a chance to 31
## a copy of a copy of 12
Plotting N-Grams
Unigram Frequency (200 words)
library(wordcloud)
library(RColorBrewer)

wordcloud(words = uni_freq$words, freq = uni_freq$frequency,
          max.words = 200, colors = brewer.pal(7, "Dark2"), scale = c(10, .5))

Bigram Frequency (100 words)
wordcloud(words = bi_freq$words, freq = bi_freq$frequency,
          max.words = 100, colors = brewer.pal(7, "Dark2"), scale = c(7, .5))

Trigram Frequency (30 words)
wordcloud(words = tri_freq$words, freq = tri_freq$frequency,
          max.words = 30, colors = brewer.pal(7, "Dark2"), scale = c(5, .5))

Plots using Bar charts
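Each of the three plots below selects the 15 most frequent terms by ordering the frequency table. The same selection could be wrapped in a small helper; topTerms is a hypothetical name and is not used in the chunks below:

# Return the n most frequent rows of a word/frequency data frame
topTerms <- function(freq_df, n = 15) {
  head(freq_df[order(-freq_df$frequency), ], n)
}
# e.g. topTerms(uni_freq) is equivalent to uni_freq[order(-uni_freq$frequency), ][1:15, ]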
One Word frequency - top 15
library(ggplot2)

OneWord <- ggplot(data = uni_freq[order(-uni_freq$frequency), ][1:15, ],
                  aes(x = reorder(words, -frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "red") +
  theme(axis.text.x = element_text(angle = 90)) +
  ggtitle("One Word frequency - Top 15") +
  xlab("words") + ylab("frequency")
OneWord

Two Words frequency - top 15
TwoWords <- ggplot(data = bi_freq[order(-bi_freq$frequency), ][1:15, ],
                   aes(x = reorder(words, -frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "green") +
  theme(axis.text.x = element_text(angle = 90)) +
  ggtitle("Two Words frequency - Top 15") +
  xlab("words") + ylab("frequency")
TwoWords

Three Words frequency - top 15
ThreeWords <- ggplot(data = tri_freq[order(-tri_freq$frequency), ][1:15, ],
                     aes(x = reorder(words, -frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "blue") +
  theme(axis.text.x = element_text(angle = 90)) +
  ggtitle("Three Words frequency - Top 15") +
  xlab("words") + ylab("frequency")
ThreeWords
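The three bar charts can also be shown together. One option is the gridExtra package (not loaded elsewhere in this report), for example:

library(gridExtra)
# Stack the unigram, bigram and trigram charts in a single column
grid.arrange(OneWord, TwoWords, ThreeWords, ncol = 1)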
