This is an R Markdown document used to perform exploratory analysis of the data set.

Check and Install Necessary Packages

The analysis needs the tm, SnowballC, ggplot2, wordcloud, and RWeka packages for text processing and visualization. Any of these packages that are not already installed will be installed first.

list.of.packages <- c("tm","SnowballC","ggplot2","wordcloud","RWeka")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
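
Once installed, the packages could also all be attached in one step; the following is just a convenience sketch, equivalent to the individual library() calls used in the sections below.

invisible(lapply(list.of.packages, library, character.only = TRUE)) # attach every package listed above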

Determine the File Sizes and Sample the Raw Data

The sizes of the three raw data files are shown below. To keep the computations manageable, we randomly sample 1% of the lines from each input file and use that sample for the exploratory data analysis.

set.seed(416)
files = Sys.glob("./*.txt")
sample_percent <- 0.01
test <- list()
for(fileName in files){  
  print(paste(fileName,' has ', file.info(fileName)$size,' bytes'))
  content <- readLines(fileName,encoding = 'UTF-8')
  test <- c(test,sample(content,sample_percent*length(content)))
}
## [1] "./en_US.blogs.txt  has  210160014  bytes"
## [1] "./en_US.news.txt  has  205811889  bytes"
## [1] "./en_US.twitter.txt  has  167105338  bytes"
## [1] "./sample.txt  has  3961709  bytes"
dir.create("./sample",showWarnings = FALSE)
fileConn<-file("./sample/sample.txt")
writeLines(as.character(test),fileConn)
close(fileConn)
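
As a quick sanity check (illustrative; the exact values depend on the input files), we can inspect how many lines were sampled and the size of the written sample file:

length(test)                            # number of sampled lines across all files
file.info("./sample/sample.txt")$size   # size of the written sample, in bytes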

Pre-process Text with the tm Package

The R package tm for “text mining” provides useful functions for processing text, such as removing punctuation and stopwords and reducing words to their common stems.

library(tm)
## Loading required package: NLP
files <- DirSource(directory = "./sample",encoding ="latin1" )
corpus <- VCorpus(x=files)
summary(corpus)
##            Length Class             Mode
## sample.txt 2      PlainTextDocument list
corpus <- tm_map(corpus, content_transformer(tolower)) #lowercase
corpus <- tm_map(corpus, removePunctuation, preserve_intra_word_dashes = FALSE) # remove punctuation
corpus <- tm_map(corpus, removeWords, stopwords("english")) # remove stopwords
corpus <- tm_map(corpus, stemDocument) # reduce word forms to stems
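
tm offers further cleaning transformations. As an optional sketch (corpus_extra is only an illustrative name and is not used for the results shown below), numbers and repeated whitespace could also be removed:

corpus_extra <- tm_map(corpus, removeNumbers)           # optional: drop digits
corpus_extra <- tm_map(corpus_extra, stripWhitespace)   # optional: collapse repeated whitespace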

Exploratory Data Analysis

Most and Least Frequently Occurring Words

We count the occurrences of single words (unigrams) and inspect the most and least frequent terms.

corpus <- tm_map(corpus, PlainTextDocument)
dtm <- DocumentTermMatrix(corpus)
freq <- colSums(as.matrix(dtm))
#create sort order (descending)
ord <- order(freq,decreasing=TRUE)
#inspect most frequently occurring terms
freq[head(ord)]
## just like  get  one will time 
## 2545 2421 2418 2255 2195 1979
#inspect least frequently occurring terms
freq[tail(ord)] 
##   zupan zurcher  zurich   zygot   zynga    zzub 
##       1       1       1       1       1       1
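
tm also provides findFreqTerms() as a shortcut for listing every term above a frequency threshold; a minimal sketch (the threshold of 1500 here is arbitrary):

findFreqTerms(dtm, lowfreq = 1500)  # all terms occurring at least 1500 times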

We can also plot the frequencies of the most common terms as a bar chart, for example:

wf=data.frame(term=names(freq),occurrences=freq)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
p <- ggplot(subset(wf, occurrences>1500), aes(term, occurrences)) + geom_bar(stat="identity")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p

With the R package wordcloud, we can visualize the top 50 unigrams.

library(wordcloud)
## Loading required package: RColorBrewer
set.seed(39)
tm_unifreq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)
wordcloud(names(tm_unifreq), tm_unifreq, max.words=50, scale=c(5, .1), colors=brewer.pal(6, "Dark2"))

Counting n-grams instead of words

Suppose we want to count word sequences of length 2, also called bigrams. We can use the RWeka package to tokenize the text into n-grams.

library(RWeka)
# Tokenizers: Tokenizer yields unigrams (single words), BigramTokenizer yields bigrams
Tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
bidtm <- DocumentTermMatrix(corpus, control = list(tokenize = BigramTokenizer))
tm_bifreq <- sort(colSums(as.matrix(bidtm)), decreasing=TRUE)
tm_biwordfreq <- data.frame(word=names(tm_bifreq), freq=tm_bifreq)
head(tm_biwordfreq,5)
##                      word freq
## right now       right now  232
## look like       look like  183
## cant wait       cant wait  175
## look forward look forward  164
## last night     last night  160
p <- ggplot(subset(tm_biwordfreq, freq>100), aes(word, freq)) + geom_bar(stat="identity")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p

# word cloud
set.seed(39)
tm_bifreq <- sort(colSums(as.matrix(bidtm)), decreasing=TRUE)
wordcloud(names(tm_bifreq), tm_bifreq, max.words=50, scale=c(5, .1), colors=brewer.pal(6, "Dark2"))
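
The same approach extends to longer n-grams; as a sketch (results not shown above), a trigram tokenizer only changes the min and max values passed to Weka_control:

# Trigram tokenizer: word sequences of length 3 (illustrative)
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tridtm <- DocumentTermMatrix(corpus, control = list(tokenize = TrigramTokenizer))
tm_trifreq <- sort(colSums(as.matrix(tridtm)), decreasing=TRUE)
head(tm_trifreq, 5)  # five most frequent trigrams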