library(tidyverse)
library(stringr)
library(qdap)
library(wordcloud)
library(tm)
library(caret)
library(RWeka)
library(readr)
library(slam)
Counting the exact number of words in files this large is slow and memory-hungry in plain R, and we are not using Hadoop or MapReduce here. Instead, we approximate the word count from the character count: the average English word is about 5 characters long plus 1 trailing whitespace, so dividing the total number of characters by 6 gives a rough word count. This is a practical workaround for users without a big data toolchain.
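As a rough sanity check of the characters-divided-by-six heuristic (an optional extra, not part of the original analysis), we can compare it against a direct whitespace-based token count on the first few thousand lines of one file; str_count() comes from stringr, which is already loaded.
check <- read_lines("./en_US/en_US.blogs.txt", n_max = 5000)
approx_words <- sum(nchar(check))/6            # heuristic: characters divided by 6
exact_words <- sum(str_count(check, "\\S+"))   # count runs of non-whitespace characters
c(approx = approx_words, exact = exact_words)  # the two should be in the same ballpark
rm(check); rm(approx_words); rm(exact_words)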
## Blogs: file size (MB), number of lines, approximate word count, and a 10,000-line sample
blogs_size <- file.info("./en_US/en_US.blogs.txt")$size/10^6
temp <- read_lines("./en_US/en_US.blogs.txt")
blogs_line <- length(temp)
blogs_word <- sum(nchar(temp))/6
sampleblogs <- sample(temp, 10000)
rm(temp)
## Twitter: same summary statistics and a 10,000-line sample
twitter_size <- file.info("./en_US/en_US.twitter.txt")$size/10^6
temp <- read_lines("./en_US/en_US.twitter.txt")
twitter_line <- length(temp)
twitter_word <- sum(nchar(temp))/6
sampletwitter <- sample(temp, 10000)
rm(temp)
## News: same summary statistics and a 10,000-line sample
news_size <- file.info("./en_US/en_US.news.txt")$size/10^6
temp <- read_lines("./en_US/en_US.news.txt")
news_line <- length(temp)
news_word <- sum(nchar(temp))/6
samplenews <- sample(temp, 10000)
rm(temp)
tab <- tibble(name = c("blogs", "twitter", "news"),
              size_mb = c(blogs_size, twitter_size, news_size),
              lines_count = c(blogs_line, twitter_line, news_line),
              word_count = c(blogs_word, twitter_word, news_word))
tab
## # A tibble: 3 x 4
##   name     size_mb lines_count word_count
##   <chr>      <dbl>       <int>      <dbl>
## 1 blogs   210.1600      899288   34470751
## 2 twitter 167.1053     2360148   27016005
## 3 news    205.8119     1010242   33870527
Because of limited RAM, we will take small samples from the blogs, twitter and news files and combine them into one small dataset called all_samples. Using 10,000 lines from each source (30,000 lines in total) should be sufficient for building our first trial models.
Before continuing, we save the combined sample for later use and remove all objects from the workspace to free memory.
all_samples <- c(sampleblogs, sampletwitter, samplenews)
write_lines(all_samples, "./all_samples.txt")
rm(list = ls())
## Remove odd (non-ASCII, e.g. Latin-1 accented) characters
odd_clean <- function(x){
  x <- iconv(x, "latin1", "ASCII", sub = "")  # drop anything that cannot be represented in ASCII
  return(x)
}
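## Illustration (assuming the input bytes are interpreted as Latin-1): characters
## outside ASCII are dropped rather than transliterated, so accents simply disappear.
odd_clean("déjà vu")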
## Normalise text: expand abbreviations, contractions and ordinals, spell out symbols, lowercase
qdap_clean <- function(x){
  x <- replace_abbreviation(x)
  x <- replace_contraction(x)
  x <- replace_ordinal(x)
  x <- replace_symbol(x)
  x <- tolower(x)
  return(x)
}
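## Illustration of the qdap replacements (exact output depends on qdap's dictionaries):
## abbreviations, contractions, ordinals and symbols are spelled out, then lowercased.
qdap_clean("Dr. Smith won't finish 1st & 2nd")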
## Download a list of profane words (to be filtered out later)
URL <-"https://raw.githubusercontent.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en"
download.file(URL, destfile = "bad_words.txt")
bad_words <- read_lines("bad_words.txt")
## Remove punctuation, collapse extra whitespace, and remove numbers, English stopwords and profanity
tm_clean <- function(corpus){
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, stripWhitespace)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removeWords, c(stopwords("en"), bad_words))
  return(corpus)
}
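As an optional illustration (not part of the original analysis), the full cleaning chain can be exercised on a tiny hand-made example before applying it to the real sample; the toy_text and toy_corpus objects below are hypothetical helpers used only for this check.
toy_text <- qdap_clean(odd_clean("I won't buy 2 coffees for   Dr. Smith!"))
toy_corpus <- tm_clean(VCorpus(VectorSource(toy_text)))
as.character(toy_corpus[[1]])  # punctuation, numbers, stopwords and extra spaces are gone
rm(toy_text); rm(toy_corpus)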
The profanity list is borrowed from https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/blob/master/en.
all_samples <- read_lines("./all_samples.txt")
all_samples <- sample(all_samples, size = length(all_samples))  # shuffle so the three sources are mixed
all_samples <- odd_clean(all_samples)
all_samples <- qdap_clean(all_samples)
sample_corpus <- VCorpus(VectorSource(all_samples))
sample_corpus <- tm_clean(sample_corpus)
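A quick spot check (optional, not in the original write-up) confirms the cleaning worked before we invest time in building term-document matrices:
as.character(sample_corpus[[1]])  # first cleaned document: lowercase, no punctuation, numbers or stopwords
length(sample_corpus)             # 30,000 documents, one per sampled line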
We will now create a term-document matrix (TDM) and start extracting features from the dataset.
rm(bad_words); rm(odd_clean); rm(qdap_clean); rm(tm_clean); rm(URL); rm(all_samples)
sample_tdm <- TermDocumentMatrix(sample_corpus)
sample_freq <- row_sums(sample_tdm)
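## Optional peek at the TDM (illustrative): rows are terms, columns are documents;
## findFreqTerms() lists terms whose total count is at least the given threshold.
dim(sample_tdm)
findFreqTerms(sample_tdm, lowfreq = 1000)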
wordcloud(names(sample_freq), sample_freq, max.words = 50, colors = brewer.pal(4, "Paired"))
top25_1gram <- sample_freq[order(sample_freq, decreasing = TRUE)][1:25]
top25_1gram <- rownames_to_column(as.data.frame(top25_1gram))
names(top25_1gram) <- c("word","count")
ggplot(top25_1gram, aes(x = word, y = count)) + geom_col(fill = "lightblue3", col = "black") + coord_flip() + scale_x_discrete(limits = top25_1gram$word)
rm(sample_freq); rm(sample_tdm)
## Create bigram tokenizer for the term-document matrix
tokenizer <- function(x){
  NGramTokenizer(x, Weka_control(min = 2, max = 2))
}
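## Quick check of the tokenizer (illustrative): a four-word string should yield
## three bigrams, "the quick", "quick brown" and "brown fox".
tokenizer("the quick brown fox")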
sample_tdm <- TermDocumentMatrix(sample_corpus, control = list(tokenize = tokenizer))
sample_freq <- row_sums(sample_tdm, na.rm = TRUE)
wordcloud(names(sample_freq),sample_freq,max.words = 25, color= brewer.pal(5, "Set2"))
top25_2gram <- sample_freq[order(sample_freq, decreasing = TRUE)][1:25]
top25_2gram <- rownames_to_column(as.data.frame(top25_2gram))
names(top25_2gram) <- c("word","count")
ggplot(top25_2gram, aes(x = word, y = count)) + geom_col(fill = "coral2", col = "black") + coord_flip() + scale_x_discrete(limits = top25_2gram$word)
rm(sample_freq); rm(sample_tdm)
## Create trigram tokenizer for the term-document matrix
tokenizer <- function(x){
  NGramTokenizer(x, Weka_control(min = 3, max = 3))
}
sample_tdm <- TermDocumentMatrix(sample_corpus, control = list(tokenize = tokenizer))
sample_freq <- row_sums(sample_tdm, na.rm = TRUE)
wordcloud(names(sample_freq),sample_freq,max.words = 25, color= brewer.pal(5, "Dark2"))
top25_3gram <- sample_freq[order(sample_freq, decreasing = TRUE)][1:25]
top25_3gram <- rownames_to_column(as.data.frame(top25_3gram))
names(top25_3gram) <- c("word","count")
ggplot(top25_3gram, aes(x = word, y = count)) + geom_col(fill = "darkblue", col = "black") + coord_flip() + scale_x_discrete(limits = top25_3gram$word)