Data Cleaning and Pre-processing

Loading raw data

Loading data from text files:

tweets <- readLines('en_US.twitter.txt') # 2.3M
blogs <- readLines('en_US.blogs.txt') # 900K
news <- readLines('en_US.news.txt') # 80K

To reduce processing time, I create a sample of about 100K rows:

tweets.sample <- sample(tweets, size=round(length(tweets) * 0.01), replace=FALSE)
blogs.sample <- sample(blogs, size=round(length(blogs) * 0.03), replace=FALSE)
news.sample <- sample(news, size=round(length(news) * 0.3), replace=FALSE)
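
Note that sample() draws different rows on every run. If the sample needs to be reproducible (an assumption, not something the pipeline above relies on), the random seed can be fixed before the three sample() calls; the seed value below is an arbitrary choice:

set.seed(1234)  # fix the RNG state so the same rows are drawn on each run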

Combine the samples from the three files into a single dataset:

txt <- c(tweets.sample, blogs.sample, news.sample) # ~200K lines, ~37 MB

Then I save this raw data sample into an R-native file:

save(txt,file="sample.RData")

When needed, it can be loaded quickly without prior processing:

load("sample.RData")

Cleaning raw data

library(tm)  # provides Corpus(), tm_map() and the cleaning transformations (also attaches NLP)

# replace unwanted characters with a space so adjacent words do not get glued together
toSpace <- function(x, pattern) {gsub(pattern, " ", x)}
corpus <- Corpus(VectorSource(txt), readerControl=list(reader=readPlain, language="en_US", load=TRUE))
corpus <- tm_map(corpus, content_transformer(toSpace), "[/@$:)*&!?_#-]")
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, FUN=stripWhitespace)
corpus <- tm_map(corpus, FUN=removeNumbers)
corpus <- tm_map(corpus, FUN=removePunctuation)
corpus <- tm_map(corpus, FUN=removeWords, stopwords("english"))
# a few extra high-frequency words; most are already covered by stopwords("english")
corpus <- tm_map(corpus, FUN=removeWords, c("the", "will", "also", "that", "and", "for", "in", "is", "it", "not", "to"))
save(corpus, file="sample_corpus.RData")
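
A quick spot check (not part of the pipeline itself) confirms the cleaning worked: the documents should now be lower-case and free of numbers, punctuation and the removed stopwords.

inspect(corpus[1:3])        # print the first three cleaned documents
as.character(corpus[[1]])   # or pull a single document out as a plain string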

At this point the raw data is no longer needed and those variables can be removed from memory:

rm(news)
rm(blogs)
rm(tweets)
gc()

Create helper functions for tokenizing:

NGramTokenizer1 <- function(x) unlist(lapply(NLP::ngrams(words(x), 1), paste, collapse=" "), use.names=FALSE)
NGramTokenizer2 <- function(x) unlist(lapply(NLP::ngrams(words(x), 2), paste, collapse=" "), use.names=FALSE)
NGramTokenizer3 <- function(x) unlist(lapply(NLP::ngrams(words(x), 3), paste, collapse=" "), use.names=FALSE)
NGramTokenizer4 <- function(x) unlist(lapply(NLP::ngrams(words(x), 4), paste, collapse=" "), use.names=FALSE)
NGramTokenizer5 <- function(x) unlist(lapply(NLP::ngrams(words(x), 5), paste, collapse=" "), use.names=FALSE)
NGramTokenizer8 <- function(x) unlist(lapply(NLP::ngrams(words(x), 8), paste, collapse=" "), use.names=FALSE)
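
These six helpers differ only in the n-gram length, so the same thing can be written once as a small factory. This is just an alternative sketch of the same idea, not what the rest of the report uses:

# build a tokenizer for an arbitrary n-gram length
makeNGramTokenizer <- function(n) {
  function(x) unlist(lapply(NLP::ngrams(words(x), n), paste, collapse=" "), use.names=FALSE)
}
NGramTokenizer2 <- makeNGramTokenizer(2)  # equivalent to the hand-written version above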

Then I create N-grams of 1, 2, 3, 4, 5 and 8 words and save each of them to a file, so that I do not have to go through this procedure every time. The file modification dates also show roughly how long each step took:

ng1 <- TermDocumentMatrix(corpus, control=list(tokenize=NGramTokenizer1))
save(ng1,file="NGram-1.RData")

ng2 <- TermDocumentMatrix(corpus, control=list(tokenize=NGramTokenizer2))
save(ng2,file="NGram-2.RData")

ng3 <- TermDocumentMatrix(corpus, control=list(tokenize=NGramTokenizer3))
save(ng3,file="NGram-3.RData")

ng4 <- TermDocumentMatrix(corpus, control=list(tokenize=NGramTokenizer4))
save(ng4,file="NGram-4.RData")

ng5 <- TermDocumentMatrix(corpus, control=list(tokenize=NGramTokenizer5))
save(ng5,file="NGram-5.RData")

ng8 <- TermDocumentMatrix(corpus, control=list(tokenize=NGramTokenizer8))
save(ng8,file="NGram-8.RData")
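
Because each matrix is saved as soon as it is built, the gaps between file modification times give a rough duration for each step; a small check along these lines (the file names match the save() calls above):

# modification times of the saved N-gram files; consecutive differences
# approximate how long each TermDocumentMatrix took to build
files <- paste0("NGram-", c(1, 2, 3, 4, 5, 8), ".RData")
file.info(files)$mtime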

Packages used

The following packages were used (a short block attaching them all follows this list):

wordcloud and RColorBrewer - for plotting a word cloud of frequent words

NLP and tm - for tokenizing and N-gram analysis

ggplot2 - for plotting histograms

plyr - for aggregating datasets
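
For completeness, the code in this report assumes all of these packages are installed and attached; a minimal setup block would be:

library(NLP)           # n-gram generation
library(tm)            # corpus handling and term-document matrices
library(ggplot2)       # histograms of frequent n-grams
library(wordcloud)     # word cloud plot
library(RColorBrewer)  # colour palettes for the word cloud
library(plyr)          # aggregating frequency tables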

Data Analysis

First of all, let’s clear all variables from the R environment and start from scratch.

rm(list=ls())

Load N-Grams

We stored the N-grams on disk in R-native format, so we can now quickly load them into memory without processing the raw data again.

load(file=paste(foldername,"NGram-1.RData",sep=""))
load(file=paste(foldername,"NGram-2.RData",sep=""))
load(file=paste(foldername,"NGram-3.RData",sep=""))
#load(file=paste(foldername,"NGram-4.RData",sep=""))
#load(file=paste(foldername,"NGram-5.RData",sep=""))
#load(file=paste(foldername,"NGram-8.RData",sep="")) 

Plot a word cloud

w <- findFreqTerms(ng1, lowfreq = 500)            # unigrams seen at least 500 times
wf <- rowSums(as.matrix(ng1[w,]))                 # total frequency of each term
df <- data.frame(word=names(wf), frequency=wf, row.names = NULL)
wordcloud(df$word, df$frequency, min.freq=100, colors=brewer.pal(6, "Dark2"))

Plot bigram histogram

w <- findFreqTerms(ng2, lowfreq = 100)
wf <- rowSums(as.matrix(ng2[w,]))
df <- data.frame(word=names(wf), frequency=wf, row.names = NULL)
df <- df[order(-df$frequency),]   # sort so the first rows really are the most common bigrams

ggplot(df[1:15,], aes(x=reorder(word, frequency), y=frequency)) +
  geom_bar(stat = "identity") + coord_flip() +
  xlab("2-gram words") + ylab("Frequency in the sample") +
  ggtitle('Most Common 2-Grams')

Plot trigram histogram

I’m pretty surprised to see “World War II” in the list of frequent terms; I was under the impression it had been forgotten. It is good news that people still remember history.

w <- findFreqTerms(ng3, lowfreq = 20)
wf <- rowSums(as.matrix(ng3[w,]))
df <- data.frame(word=names(wf), frequency=wf, row.names = NULL)
df <- df[order(-df$frequency),]   # sort so the first rows really are the most common trigrams

ggplot(df[1:15,], aes(x=reorder(word, frequency), y=frequency)) +
  geom_bar(stat = "identity") + coord_flip() +
  xlab("3-gram words") + ylab("Frequency in the sample") +
  ggtitle('Most Common 3-Grams')

Future opportunities

Having prepared tables of frequent word combinations, I plan to use the 4-grams and 5-grams to predict the next word.
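
As an illustration of the idea (a sketch only; the helper below and its column names `ngram` and `frequency` are hypothetical, not something this report defines), the prediction would look up the last few typed words as a prefix in an n-gram frequency table and return the most frequent completion, backing off to a shorter n-gram table when nothing matches:

# hypothetical next-word lookup over an n-gram frequency table with
# columns `ngram` (space-separated words) and `frequency`
predictNextWord <- function(prefix, freq) {
  hits <- freq[startsWith(freq$ngram, paste0(prefix, " ")), ]
  if (nrow(hits) == 0) return(NA_character_)       # caller backs off to a shorter prefix
  best <- hits$ngram[which.max(hits$frequency)]    # most frequent matching n-gram
  tail(strsplit(best, " ")[[1]], 1)                # its last word is the prediction
}

# usage sketch: try the 5-gram table first, then back off to 4-grams, and so on
# predictNextWord("happy mothers day", freq4)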