This document presents the basic processing and exploratory analysis of the blogs, news and Twitter datasets. It covers creating the corpora and preparing basic 1-gram, 2-gram, 3-gram and 4-gram summaries with the top 5 phrases for each category. In addition, the same analysis is conducted with English stopwords omitted.
Loading the necessary packages. To perform the exploratory analysis of the provided datasets, the “tm” package will be employed, along with “ggplot2” for plotting.
library(tm, quietly = TRUE)
library(ggplot2, quietly = TRUE, warn.conflicts = FALSE)
Setting up connections to the files with the blog, news and Twitter data scraped from the web.
conBlog <- file("../final/en_US/en_US.blogs.txt", "r")
conNews <- file("../final/en_US/en_US.news.txt", "r")
conTwit <- file("../final/en_US/en_US.twitter.txt", "r")
Reading the full files into memory. The encoding is set to UTF-8 to ensure that non-Latin characters are read correctly. Warnings about a missing end-of-line (EOL) character are suppressed and NUL characters are skipped.
dtBlog <- readLines(conBlog, encoding = "UTF-8", warn = FALSE, skipNul = TRUE)
dtNews <- readLines(conNews, encoding = "UTF-8", warn = FALSE, skipNul = TRUE)
dtTwit <- readLines(conTwit, encoding = "UTF-8", warn = FALSE, skipNul = TRUE)
All open connections are closed immediately after reading the data.
close(conBlog)
close(conNews)
close(conTwit)
rm(conBlog)
rm(conNews)
rm(conTwit)
Presenting some basic information on the datasets: size in memory and number of elements.
data.frame(
  Name = c("Blogs", "News", "Twitter"),
  Size = c(
    format(object.size(dtBlog), units = "MB"),
    format(object.size(dtNews), units = "MB"),
    format(object.size(dtTwit), units = "MB")
  ),
  Elements = c(length(dtBlog), length(dtNews), length(dtTwit))
)
## Name Size Elements
## 1 Blogs 255.4 Mb 899288
## 2 News 19.8 Mb 77259
## 3 Twitter 319 Mb 2360148
Selecting a random 1% sample of each dataset to build the corpus, because processing the full ~0.5 GB of data requires significant computational power.
set.seed(116)
dtBlogTr <- sample(dtBlog, length(dtBlog) * .01)
dtNewsTr <- sample(dtNews, length(dtNews) * .01)
dtTwitTr <- sample(dtTwit, length(dtTwit) * .01)
dtall <- c(dtBlogTr, dtNewsTr, dtTwitTr)
Releasing memory
rm(dtBlog)
rm(dtNews)
rm(dtTwit)
rm(dtBlogTr)
rm(dtNewsTr)
rm(dtTwitTr)
Creating the corpus from the combined sample
allcorp <- VCorpus(VectorSource(dtall))
Releasing memory
rm(dtall)
Tokenisation: transforming all text to lower case, removing numbers and punctuation, and stripping the remaining extra whitespace. A separate corpus is also created with all English stopwords removed; the stopwords are removed after lower-casing (so that capitalised forms are matched) but before punctuation removal (so that contractions such as "don't" are still recognised).
allcorp <- tm_map(allcorp, FUN = content_transformer(tolower))
# Remove stopwords after lower-casing so capitalised forms ("The", "And") are matched,
# and before stripping punctuation so contractions such as "don't" are still recognised
nswcorp <- tm_map(allcorp, FUN = removeWords, stopwords("english"))
allcorp <- tm_map(allcorp, FUN = removeNumbers)
allcorp <- tm_map(allcorp, FUN = removePunctuation)
allcorp <- tm_map(allcorp, FUN = stripWhitespace)
allcorp <- tm_map(allcorp, FUN = PlainTextDocument)
nswcorp <- tm_map(nswcorp, FUN = removeNumbers)
nswcorp <- tm_map(nswcorp, FUN = removePunctuation)
nswcorp <- tm_map(nswcorp, FUN = stripWhitespace)
nswcorp <- tm_map(nswcorp, FUN = PlainTextDocument)
Creating unigram, bigram, trigram and tetragram tokenizers
create_unigram <- function(x) vapply(ngrams(strsplit(as.character(x), " ", fixed = TRUE)[[1L]], 1L), paste, "", collapse = " ")
create_bigram <- function(x) vapply(ngrams(strsplit(as.character(x), " ", fixed = TRUE)[[1L]], 2L), paste, "", collapse = " ")
create_trigram <- function(x) vapply(ngrams(strsplit(as.character(x), " ", fixed = TRUE)[[1L]], 3L), paste, "", collapse = " ")
create_tetragram <- function(x) vapply(ngrams(strsplit(as.character(x), " ", fixed = TRUE)[[1L]], 4L), paste, "", collapse = " ")
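A quick sanity check of the tokenizers on a made-up example sentence (not taken from the data): each tokenizer splits the text on single spaces and pastes consecutive word groups back together, so the bigram tokenizer should return every pair of adjacent words.
# Illustrative check on a made-up sentence (not from the datasets)
create_bigram("this is a short example")
# Expected: "this is" "is a" "a short" "short example"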
Creating tokenized term-document matrices for the full corpus (stopwords retained)
allcorpUn <- TermDocumentMatrix(allcorp, control = list(tokenizer = create_unigram))
allcorpBi <- TermDocumentMatrix(allcorp, control = list(tokenizer = create_bigram))
allcorpTr <- TermDocumentMatrix(allcorp, control = list(tokenizer = create_trigram))
allcorpTe <- TermDocumentMatrix(allcorp, control = list(tokenizer = create_tetragram))
Creating tokenized term-document matrices for the corpus without English stopwords
nswcorpUn <- TermDocumentMatrix(nswcorp, control = list(tokenizer = create_unigram))
nswcorpBi <- TermDocumentMatrix(nswcorp, control = list(tokenizer = create_bigram))
nswcorpTr <- TermDocumentMatrix(nswcorp, control = list(tokenizer = create_trigram))
nswcorpTe <- TermDocumentMatrix(nswcorp, control = list(tokenizer = create_tetragram))
Finding unigrams, bigrams, trigrams and tetragrams that occur at least 5 times in the corpus with stopwords
allUn <- findFreqTerms(allcorpUn, lowfreq = 5)
allBi <- findFreqTerms(allcorpBi, lowfreq = 5)
allTr <- findFreqTerms(allcorpTr, lowfreq = 5)
allTe <- findFreqTerms(allcorpTe, lowfreq = 5)
Finding unigrams, bigrams, trigrams and tetragrams that occur at least 5 times in the corpus without stopwords
nswUn <- findFreqTerms(nswcorpUn, lowfreq = 5)
nswBi <- findFreqTerms(nswcorpBi, lowfreq = 5)
nswTr <- findFreqTerms(nswcorpTr, lowfreq = 5)
nswTe <- findFreqTerms(nswcorpTe, lowfreq = 5)
Creating a summary function that returns each frequent term together with its total frequency across all documents
summarise_n_grams <- function(tdm, ft) {
  freq <- rowSums(as.matrix(tdm[ft, ]))  # total count of each frequent term across all documents
  data.frame(words = names(freq), frequency = freq)
}
Creating summaries of the most frequent unigrams, bigrams, trigrams and tetragrams with stopwords
allUnFreq <- summarise_n_grams(allcorpUn, allUn)
allBiFreq <- summarise_n_grams(allcorpBi, allBi)
allTrFreq <- summarise_n_grams(allcorpTr, allTr)
allTeFreq <- summarise_n_grams(allcorpTe, allTe)
Creating summaries of the most frequent unigrams, bigrams, trigrams and tetragrams without stopwords
nswUnFreq <- summarise_n_grams(nswcorpUn, nswUn)
nswBiFreq <- summarise_n_grams(nswcorpBi, nswBi)
nswTrFreq <- summarise_n_grams(nswcorpTr, nswTr)
nswTeFreq <- summarise_n_grams(nswcorpTe, nswTe)
Plotting the top 5 unigrams, bigrams, trigrams and tetragrams with stopwords
ggplot(data = allUnFreq[order(-allUnFreq$frequency),][1:5, ], aes(x = reorder(words, -frequency), y=frequency)) + geom_bar(stat="identity")
ggplot(data = allBiFreq[order(-allBiFreq$frequency),][1:5, ], aes(x = reorder(words, -frequency), y=frequency)) + geom_bar(stat="identity")
ggplot(data = allTrFreq[order(-allTrFreq$frequency),][1:5, ], aes(x = reorder(words, -frequency), y=frequency)) + geom_bar(stat="identity")
ggplot(data = allTeFreq[order(-allTeFreq$frequency),][1:5, ], aes(x = reorder(words, -frequency), y=frequency)) + geom_bar(stat="identity")
Plotting the top 5 unigrams, bigrams, trigrams and tetragrams without stopwords
ggplot(data = nswUnFreq[order(-nswUnFreq$frequency),][1:5, ], aes(x = reorder(words, -frequency), y=frequency)) + geom_bar(stat="identity")
ggplot(data = nswBiFreq[order(-nswBiFreq$frequency),][1:5, ], aes(x = reorder(words, -frequency), y=frequency)) + geom_bar(stat="identity")
ggplot(data = nswTrFreq[order(-nswTrFreq$frequency),][1:5, ], aes(x = reorder(words, -frequency), y=frequency)) + geom_bar(stat="identity")
ggplot(data = nswTeFreq[order(-nswTeFreq$frequency),][1:5, ], aes(x = reorder(words, -frequency), y=frequency)) + geom_bar(stat="identity")
The next step is to build prediction models, probably SVMs.
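As a rough sketch of how the frequency tables above could feed such a predictor (an illustration only, not the final model), a naive next-word lookup can return the most frequent bigram that starts with a given word. The function name predict_next and the example input "just" are purely hypothetical.
# Naive next-word lookup based on the bigram frequency table built above (sketch only)
predict_next <- function(word, bigrams = allBiFreq) {
  hits <- bigrams[grepl(paste0("^", word, " "), bigrams$words), ]
  if (nrow(hits) == 0) return(NA_character_)
  # second word of the most frequent bigram beginning with 'word'
  strsplit(as.character(hits$words[which.max(hits$frequency)]), " ", fixed = TRUE)[[1L]][2L]
}
predict_next("just")  # hypothetical input word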