Loading Libraries

# Attach the packages used by this analysis, one call per line.
library(doParallel) # registerDoParallel(), detectCores()
library(tm)         # VCorpus, tm_map, TermDocumentMatrix
library(stringi)    # stri_count_words, stri_stats_*
library(RWeka)      # NGramTokenizer, Weka_control
library(dplyr)
library(kableExtra) # kable_styling
library(SnowballC)
library(ggplot2)    # plots

Setting up doParallel

# Register a parallel backend that leaves two cores free.
library(doParallel)
set.seed(613)
# detectCores() can return NA, and subtracting 2 can reach zero or below on
# small machines; max(..., na.rm = TRUE) falls back to one worker then.
n_cores <- max(1L, detectCores() - 2L, na.rm = TRUE)
registerDoParallel(cores = n_cores)

Show files used

# Folder that holds the en_US corpus files, followed by a listing of it.
# No trailing slash here: the file names are appended with their own
# separator, so a trailing slash would produce "//" in the paths.
directory_us <- file.path(".", "data", "final", "en_US")
dir(directory_us)

# Load the blogs, news and twitter files as character vectors of lines.
# The connections are opened in binary mode ("rb"): in text mode an embedded
# control character (e.g. Ctrl-Z) can end the read early on Windows and
# silently truncate the data. skipNul = TRUE drops embedded NUL bytes.
blogs_con <- file(file.path(directory_us, "en_US.blogs.txt"), "rb")
blogs <- readLines(blogs_con, encoding = "UTF-8", skipNul = TRUE)
close(blogs_con)

news_con <- file(file.path(directory_us, "en_US.news.txt"), "rb")
news <- readLines(news_con, encoding = "UTF-8", skipNul = TRUE)
close(news_con)

twitter_con <- file(file.path(directory_us, "en_US.twitter.txt"), "rb")
twitter <- readLines(twitter_con, encoding = "UTF-8", skipNul = TRUE)
close(twitter_con)

Create stats of files

# Words per line (WPL): minimum, mean and maximum for each file.
# The result has one column per file, in the order blogs, news, twitter.
WPL <- sapply(
  list(blogs, news, twitter),
  function(x) summary(stri_count_words(x))[c("Min.", "Mean", "Max.")]
)
rownames(WPL) <- c("WPL_Min", "WPL_Mean", "WPL_Max")

# One row per file: general stringi statistics, total words and WPL figures.
rawstats <- data.frame(
  File = c("blogs", "news", "twitter"),
  t(rbind(
    sapply(list(blogs, news, twitter), stri_stats_general),
    # Pick the word count by name instead of by row position.
    TotalWords = sapply(
      list(blogs, news, twitter), stri_stats_latex
    )["Words", ],
    WPL
  ))
)

# Show stats in table
kable(rawstats) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

# Sample of data: draw 1% of the lines of each file (seeded so the draw is
# reproducible) and save the combined sample to disk.
set.seed(613)
data.sample <- c(
  sample(blogs, length(blogs) * 0.01),
  sample(news, length(news) * 0.01),
  sample(twitter, length(twitter) * 0.01)
)
saveRDS(data.sample, "sample.rds")

Clean up the objects we no longer use.

# Remove the raw inputs, their connections and the intermediate tables from
# the workspace.
rm(list = c(
  "blogs", "blogs_con",
  "data.sample", "directory_us",
  "news", "news_con",
  "rawstats",
  "twitter", "twitter_con",
  "WPL"
))

Load the RDS file

# Read the sampled lines back from disk.
data <- readRDS("sample.rds")

# Create a Corpus
docs <- VCorpus(VectorSource(data))

# Remove data we do not need.
# tolower() is a plain base function, so it is wrapped in
# content_transformer() to keep every corpus element a text document.
docs <- tm_map(docs, content_transformer(tolower))
# Stop words are removed before punctuation: entries such as "don't" no
# longer match once the apostrophe has been stripped.
docs <- tm_map(docs, removeWords, stopwords("english"))
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)

# Do stemming
docs <- tm_map(docs, stemDocument)

# Strip whitespaces
docs <- tm_map(docs, stripWhitespace)

Create tokenization functions

# Build a tokenizer that calls NGramTokenizer() with min = max = n.
make_ngram_tokenizer <- function(n) {
  force(n) # evaluate now so the returned closure keeps this value
  function(x) NGramTokenizer(x, Weka_control(min = n, max = n))
}

# Tokenizers for one-, two- and three-word sequences.
unigram <- make_ngram_tokenizer(1)
bigram <- make_ngram_tokenizer(2)
trigram <- make_ngram_tokenizer(3)

Create plain text format

# Pass every element of the corpus through PlainTextDocument().
docs <- tm_map(x = docs, FUN = PlainTextDocument)

Create TermDocumentMatrix with Tokenizations and Remove Sparse Terms

# Build a term-document matrix using `tokenizer`, then apply
# removeSparseTerms() with the threshold `sparse`.
build_tdm <- function(corpus, tokenizer, sparse = 0.9999) {
  tdm <- TermDocumentMatrix(corpus, control = list(tokenize = tokenizer))
  removeSparseTerms(tdm, sparse)
}

# One matrix per n-gram order.
tdm_freq1 <- build_tdm(docs, unigram)
tdm_freq2 <- build_tdm(docs, bigram)
tdm_freq3 <- build_tdm(docs, trigram)

Create frequencies

# Total count of every term across all documents, most frequent first.
# The sums are taken from the sparse triplet components of the matrix
# (i = term index, v = count) instead of as.matrix(), which would allocate
# a dense terms x documents matrix and can exhaust memory.
term_frequencies <- function(tdm) {
  totals <- numeric(tdm$nrow)
  per_term <- rowsum(tdm$v, tdm$i)
  # rowsum() labels its rows with the term indices it found.
  totals[as.integer(rownames(per_term))] <- per_term[, 1L]
  names(totals) <- tdm$dimnames[[1L]]
  sort(totals, decreasing = TRUE)
}

uni_freq <- term_frequencies(tdm_freq1)
bi_freq <- term_frequencies(tdm_freq2)
tri_freq <- term_frequencies(tdm_freq3)

Create DataFrames

# Turn a named frequency vector into a data frame with the columns
# `term` (the names) and `freq` (the counts).
to_freq_df <- function(counts) {
  data.frame(term = names(counts), freq = counts)
}

uni_df <- to_freq_df(uni_freq)
bi_df <- to_freq_df(bi_freq)
tri_df <- to_freq_df(tri_freq)

Show head 10 of unigrams

# Table of the first 10 rows of the unigram frequency table.
kable(head(uni_df, 10)) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

# Plot head 20 of unigrams: bars ordered by decreasing frequency, centred
# title and x labels rotated by 45 degrees.
head(uni_df, 20) %>%
  ggplot(aes(reorder(term, -freq), freq)) +
  geom_bar(stat = "identity") +
  ggtitle("20 Most Unigrams") +
  xlab("Unigrams") +
  ylab("Frequency") +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

Show head 10 of bigrams

# Table of the first 10 rows of the bigram frequency table.
kable(head(bi_df, 10)) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

# Plot head 20 of bigrams: bars ordered by decreasing frequency, centred
# title and x labels rotated by 45 degrees.
head(bi_df, 20) %>%
  ggplot(aes(reorder(term, -freq), freq)) +
  geom_bar(stat = "identity") +
  ggtitle("20 Most Bigrams") +
  xlab("Bigrams") +
  ylab("Frequency") +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

Show head 10 of trigrams

# Table of the first 10 rows of the trigram frequency table.
kable(head(tri_df, 10)) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

# Plot head 20 of trigrams: bars ordered by decreasing frequency, centred
# title and x labels rotated by 45 degrees.
head(tri_df, 20) %>%
  ggplot(aes(reorder(term, -freq), freq)) +
  geom_bar(stat = "identity") +
  ggtitle("20 Most Trigrams") +
  xlab("Trigrams") +
  ylab("Frequency") +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )