Loading packages and data

library(tidyverse)
library(stringr)
library(qdap)
library(wordcloud)
library(tm)
library(caret)
library(RWeka)
library(readr)
library(slam)

Summary Statistics

Computing exact word counts for files this large is slow in plain R without tools such as Hadoop or MapReduce, so we approximate instead: the average English word is about 5 characters plus 1 whitespace, so dividing the total character count by 6 gives a rough word count. This is a practical workaround for users without a big data analytics setup.
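
For comparison, an exact count is also possible by counting word boundaries with stringr (already loaded). This minimal sketch on a toy vector is not part of the pipeline, and it would be slower than the characters/6 shortcut on the full files:

##Exact word count: count word boundaries in each line and sum them
exact_words <- function(lines) sum(str_count(lines, boundary("word")))
exact_words(c("This is a short example.", "Six characters per word is only an approximation."))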

##Blogs: file size (MB), line count, approximate word count, and a 10000-line sample
blogs_size <- file.info("./en_US/en_US.blogs.txt")$size/10^6
temp <- read_lines("./en_US/en_US.blogs.txt")
blogs_line <- length(temp)
blogs_word <- sum(nchar(temp))/6
sampleblogs <- sample(temp, 10000)
rm(temp)

##Twitter: same summary and sample
twitter_size <- file.info("./en_US/en_US.twitter.txt")$size/10^6
temp <- read_lines("./en_US/en_US.twitter.txt")
twitter_line <- length(temp)
twitter_word <- sum(nchar(temp))/6
sampletwitter <- sample(temp, 10000)
rm(temp)

##News: same summary and sample
news_size <- file.info("./en_US/en_US.news.txt")$size/10^6
temp <- read_lines("./en_US/en_US.news.txt")
news_line <- length(temp)
news_word <- sum(nchar(temp))/6
samplenews <- sample(temp, 10000)
rm(temp)

tab <- tibble(name = c("blogs","twitter","news"), 
                  size_mb = c(blogs_size, twitter_size, news_size),
                  lines_count = c(blogs_line, twitter_line, news_line),
                  word_count = c(blogs_word, twitter_word, news_word))
tab
## # A tibble: 3 x 4
##      name  size_mb lines_count word_count
##     <chr>    <dbl>       <int>      <dbl>
## 1   blogs 210.1600      899288   34470751
## 2 twitter 167.1053     2360148   27016005
## 3    news 205.8119     1010242   33870527

Loading the dataset and creating a sample

Because of limited RAM, we will take a small sample from each of blogs, twitter, and news and combine them into a small dataset called all_samples. Using 10,000 observations from each source (30,000 observations in total) should be sufficient for building our first trial models.

Before continuing, we will save the dataset for later use and remove all objects from the workspace to free memory.

all_samples <- c(sampleblogs, sampletwitter, samplenews)
write_lines(all_samples, "./all_samples.txt")
rm(list = ls())
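
Note that sample() draws a different random sample on each run. If a reproducible sample is desired, a seed can be set before drawing; a minimal sketch (the seed value is arbitrary):

##Fix the random number generator so the same 10000 lines are drawn on every run
set.seed(1234)
sampleblogs <- sample(read_lines("./en_US/en_US.blogs.txt"), 10000)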

Cleaning Datasets

##Remove non-ASCII (Latin-1) characters such as accented letters and odd symbols
odd_clean <- function(x){
    x <- iconv(x, "latin1", "ASCII", sub = "")
    return(x)
}
##Normalize text with qdap: expand abbreviations, contractions, ordinals and symbols, then lowercase
qdap_clean <- function(x){
  x <- replace_abbreviation(x)
  x <- replace_contraction(x)
  x <- replace_ordinal(x)
  x <- replace_symbol(x)
  x <- tolower(x)
  return(x)
}
##Download the bad words list
URL <-"https://raw.githubusercontent.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en"
download.file(URL, destfile = "bad_words.txt")
bad_words <- read_lines("bad_words.txt")

##Remove stopwords, bad words, punctuation, numbers and whitespaces. 
tm_clean <- function(corpus){
    corpus <- tm_map(corpus, removePunctuation)
    corpus <- tm_map(corpus, stripWhitespace)
    corpus <- tm_map(corpus, removeNumbers)
    corpus <- tm_map(corpus, removeWords, c(stopwords("en"),bad_words))
    return(corpus)
}

The bad words list is borrowed from https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/blob/master/en

all_samples <- read_lines("./all_samples.txt")
##Shuffle the lines so blogs, twitter and news are mixed
all_samples <- sample(all_samples, size = length(all_samples))
all_samples <- odd_clean(all_samples)
all_samples <- qdap_clean(all_samples)
##Build a tm corpus and apply the tm cleaning steps
sample_corpus <- VCorpus(VectorSource(all_samples))
sample_corpus <- tm_clean(sample_corpus)
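
As an optional sanity check, we can peek at the first cleaned document to confirm the transformations behaved as expected:

##Inspect the text of the first document in the cleaned corpus
as.character(sample_corpus[[1]])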

Feature Extraction

We will now create a term-document matrix (TDM) and start extracting features from the dataset.

rm(bad_words); rm(odd_clean); rm(qdap_clean); rm(tm_clean); rm(URL); rm(all_samples)
sample_tdm <- TermDocumentMatrix(sample_corpus)
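
Before moving to n-gram tokenization, it can be useful to glance at the TDM, for example by listing the terms that occur at least a given number of times (the 1000 threshold below is arbitrary):

##List terms whose total count across the sample is at least 1000
findFreqTerms(sample_tdm, lowfreq = 1000)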

1-gram Tokenizer

sample_freq <- row_sums(sample_tdm)
wordcloud(names(sample_freq), sample_freq, max.words = 50, colors = brewer.pal(4, "Paired"))

top25_1gram <- sample_freq[order(sample_freq, decreasing = TRUE)][1:25]
top25_1gram <- rownames_to_column(as.data.frame(top25_1gram))
names(top25_1gram) <- c("word","count")
ggplot(top25_1gram, aes(x = word, y = count)) + geom_col(fill = "lightblue3", col = "black") + coord_flip() + scale_x_discrete(limits = top25_1gram$word)

2-gram Tokenizer

rm(sample_freq); rm(sample_tdm)
##Create bigram tokenizer options
tokenizer <- function(x){
  NGramTokenizer(x, Weka_control(min = 2, max = 2))
}
sample_tdm <- TermDocumentMatrix(sample_corpus, control = list(tokenize = tokenizer))
sample_freq <- row_sums(sample_tdm, na.rm = TRUE)
wordcloud(names(sample_freq), sample_freq, max.words = 25, colors = brewer.pal(5, "Set2"))

top25_2gram <- sample_freq[order(sample_freq, decreasing = TRUE)][1:25]
top25_2gram <- rownames_to_column(as.data.frame(top25_2gram))
names(top25_2gram) <- c("word","count")
ggplot(top25_2gram, aes(x = word, y = count)) + geom_col(fill = "coral2", col = "black") + coord_flip() + scale_x_discrete(limits = top25_2gram$word)

3-gram Tokenizer

rm(sample_freq); rm(sample_tdm)
##Create trigram tokenizer options
tokenizer <- function(x){
  NGramTokenizer(x, Weka_control(min = 3, max = 3))
}
sample_tdm <- TermDocumentMatrix(sample_corpus, control = list(tokenize = tokenizer))
sample_freq <- row_sums(sample_tdm, na.rm = TRUE)
wordcloud(names(sample_freq), sample_freq, max.words = 25, colors = brewer.pal(5, "Dark2"))

top25_3gram <- sample_freq[order(sample_freq, decreasing = TRUE)][1:25]
top25_3gram <- rownames_to_column(as.data.frame(top25_3gram))
names(top25_3gram) <- c("word","count")
ggplot(top25_3gram, aes(x = word, y = count)) + geom_col(fill = "darkblue", col = "black") + coord_flip() + scale_x_discrete(limits = top25_3gram$word)

Findings

  1. The files are very large for R, which holds all objects in RAM, so working with a sample is crucial.
  2. Using various packages (tm, qdap, RWeka, …) makes the work much easier. Moreover, with the slam package we do not need massive amounts of RAM to aggregate term counts, since the TDM stays sparse (see the sketch after this list).
  3. Removing the stopwords reveals the most popular 1-, 2-, and 3-grams. However, it might be useful to build our model with the stopwords included, since they are part of the language.
  4. Detecting foreign-language text and junk terms (aa, aaa, abbbb, …) is challenging, but handling them should increase our model's accuracy.
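
To illustrate the point about slam, here is a minimal sketch on a toy corpus (the toy_* object names are illustrative only): row_sums() aggregates term counts directly on the sparse TDM, whereas converting to a dense matrix with as.matrix() first can exhaust memory on a large corpus.

##Toy example: aggregate term counts without densifying the TDM
toy_corpus <- VCorpus(VectorSource(c("the cat sat", "the cat ran")))
toy_tdm <- TermDocumentMatrix(toy_corpus)
row_sums(toy_tdm)            ##sparse aggregation via slam
rowSums(as.matrix(toy_tdm))  ##same result, but builds a dense matrix first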