This R Markdown document performs an exploratory analysis of the data set. It relies on the tm and SnowballC packages (among others) for the text processing, so any required packages that are not yet installed are installed first.
list.of.packages <- c("tm","SnowballC","ggplot2","wordcloud","RWeka")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
The sizes of the three raw data files are shown below. To get the calculations done in a timely manner, we randomly sample 1% of the lines from each input file and use that sample for the exploratory data analysis.
set.seed(416)
files = Sys.glob("./*.txt")
sample_percent <- 0.01
test <- list()
for(fileName in files){
print(paste(fileName, 'has', file.info(fileName)$size, 'bytes'))
content <- readLines(fileName,encoding = 'UTF-8')
test <- c(test,sample(content,sample_percent*length(content)))
}
## [1] "./en_US.blogs.txt has 210160014 bytes"
## [1] "./en_US.news.txt has 205811889 bytes"
## [1] "./en_US.twitter.txt has 167105338 bytes"
## [1] "./sample.txt has 3961709 bytes"
dir.create("./sample",showWarnings = FALSE)
fileConn<-file("./sample/sample.txt")
writeLines(as.character(test),fileConn)
close(fileConn)
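As a quick sanity check, we can reread the sample file and compute rough line and word counts; this is a minimal sketch that approximates words by splitting on whitespace (the sampleLines object is introduced only for this check).
sampleLines <- readLines("./sample/sample.txt", encoding = "UTF-8")
length(sampleLines)                                  # number of sampled lines
sum(sapply(strsplit(sampleLines, "\\s+"), length))   # approximate word count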
The R package tm for “text mining” provides useful functions for processing text, such as removing punctuation and stopwords and reducing word forms to their common stems.
library(tm)
## Loading required package: NLP
files <- DirSource(directory = "./sample",encoding ="latin1" )
corpus <- VCorpus(x=files)
summary(corpus)
## Length Class Mode
## sample.txt 2 PlainTextDocument list
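To peek at the corpus content itself, we can coerce the single document back to a character vector; this is a quick sketch for spot-checking a few lines.
head(as.character(corpus[[1]]), 3)   # first few lines of the sampled document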
corpus <- tm_map(corpus, content_transformer(tolower)) #lowercase
corpus <- tm_map(corpus, removePunctuation, preserve_intra_word_dashes = FALSE) # remove punctuation
corpus <- tm_map(corpus, removeWords, stopwords("english")) # remove stopwords
corpus <- tm_map(corpus, stemDocument) # reduce word forms to stems
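Depending on the data, further cleaning may help; tm also provides transformations to drop numbers and collapse repeated whitespace. We did not apply them here, but as a sketch they would look like this (corpus_clean is used only for illustration):
corpus_clean <- tm_map(corpus, removeNumbers)          # drop digits
corpus_clean <- tm_map(corpus_clean, stripWhitespace)  # collapse runs of whitespace left by earlier steps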
We count the occurrences of single words (unigrams) and inspect the most and least frequent terms.
corpus <- tm_map(corpus, PlainTextDocument)
dtm <- DocumentTermMatrix(corpus)
freq <- colSums(as.matrix(dtm))
#create sort order (descending)
ord <- order(freq,decreasing=TRUE)
#inspect most frequently occurring terms
freq[head(ord)]
## just like get one will time
## 2545 2421 2418 2255 2195 1979
#inspect least frequently occurring terms
freq[tail(ord)]
## zupan zurcher zurich zygot zynga zzub
## 1 1 1 1 1 1
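The tm package can also list frequent terms directly; findFreqTerms() returns every term that occurs at least lowfreq times (here the same 1500 threshold used in the plot below).
findFreqTerms(dtm, lowfreq = 1500)   # terms with at least 1500 occurrences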
We can also plot a bar chart of term frequencies, for example the terms occurring more than 1500 times:
wf <- data.frame(term = names(freq), occurrences = freq)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
p <- ggplot(subset(wf, occurrences>1500), aes(term, occurrences)) + geom_bar(stat="identity")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p
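A small variation makes the chart easier to read by ordering the bars by frequency; this sketch reuses the same wf data frame.
p2 <- ggplot(subset(wf, occurrences > 1500), aes(reorder(term, -occurrences), occurrences)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  xlab("term")
p2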
With the R package wordcloud, we can visualize the top 50 unigrams:
library(wordcloud)
## Loading required package: RColorBrewer
set.seed(39)
tm_unifreq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)
wordcloud(names(tm_unifreq), tm_unifreq, max.words=50, scale=c(5, .1), colors=brewer.pal(6, "Dark2"))
Suppose we want to count word sequences of length 2, also called bigrams. We can use the NGramTokenizer from the RWeka package:
library(RWeka)
# n-gram tokenizers built on RWeka's NGramTokenizer
Tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))        # unigrams (not used below)
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))  # two-word sequences
bidtm <- DocumentTermMatrix(corpus, control = list(tokenize = BigramTokenizer))
tm_bifreq <- sort(colSums(as.matrix(bidtm)), decreasing=TRUE)
tm_biwordfreq <- data.frame(word=names(tm_bifreq), freq=tm_bifreq)
head(tm_biwordfreq,5)
## word freq
## right now right now 232
## look like look like 183
## cant wait cant wait 175
## look forward look forward 164
## last night last night 160
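It is also useful to know how many distinct bigrams the sample contains and how many appear only once, which hints at sparsity; this is a quick check on the frequency vector computed above.
length(tm_bifreq)     # number of distinct bigrams in the sample
sum(tm_bifreq == 1)   # bigrams observed only once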
p <- ggplot(subset(tm_biwordfreq, freq>100), aes(word, freq)) + geom_bar(stat="identity")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p
# word cloud
set.seed(39)
tm_bifreq <- sort(colSums(as.matrix(bidtm)), decreasing=TRUE)
wordcloud(names(tm_bifreq), tm_bifreq, max.words=50, scale=c(5, .1), colors=brewer.pal(6, "Dark2"))
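The same pattern extends to longer n-grams; for example, a trigram tokenizer only changes the min and max arguments of Weka_control. This is a sketch, not evaluated above.
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tridtm <- DocumentTermMatrix(corpus, control = list(tokenize = TrigramTokenizer))
tm_trifreq <- sort(colSums(as.matrix(tridtm)), decreasing = TRUE)
head(tm_trifreq, 5)   # most frequent trigrams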