This document reports on an exploratory analysis of the text data.
We consider the files of the English language:
list.files("en_US")
## [1] "en_US.blogs.txt" "en_US.news.txt" "en_US.twitter.txt"
We perform data pre-processing to inspect the most common words, the most common n-grams, and so on.
library(tm)
## Loading required package: NLP
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(SnowballC)
library(biclust)
## Loading required package: MASS
## Loading required package: grid
## Loading required package: colorspace
## Loading required package: lattice
library(cluster)
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(fpc)
library(Rcampdf)
library(caret)
library(openNLP)
library(NLP)
library(ngram)
library(qdap)
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
##
## Attaching package: 'qdapRegex'
## The following object is masked from 'package:ggplot2':
##
## %+%
## Loading required package: qdapTools
##
## Attaching package: 'qdap'
## The following objects are masked from 'package:igraph':
##
## %>%, diversity
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, as.TermDocumentMatrix
## The following object is masked from 'package:NLP':
##
## ngrams
## The following object is masked from 'package:base':
##
## Filter
library(dynamicTreeCut)
set.seed(1234)
1000 lines each from the US blogs, news, and Twitter files are sampled and loaded into a variable “samples”.
blogs <- readLines("en_US/en_US.blogs.txt")
news <- readLines("en_US/en_US.news.txt")
## Warning in readLines("en_US/en_US.news.txt"): incomplete final line found
## on 'en_US/en_US.news.txt'
twits <- readLines("en_US/en_US.twitter.txt")
## Warning in readLines("en_US/en_US.twitter.txt"): line 167155 appears to
## contain an embedded nul
## Warning in readLines("en_US/en_US.twitter.txt"): line 268547 appears to
## contain an embedded nul
## Warning in readLines("en_US/en_US.twitter.txt"): line 1274086 appears to
## contain an embedded nul
## Warning in readLines("en_US/en_US.twitter.txt"): line 1759032 appears to
## contain an embedded nul
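The embedded-nul warnings above could be avoided by skipping nul bytes at read time; a minimal sketch using base R's skipNul argument to readLines:
twits <- readLines("en_US/en_US.twitter.txt", skipNul = TRUE)  # drop embedded nuls silently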
samples <- c(sample(blogs, 1000), sample(news, 1000), sample(twits, 1000))
samples <- iconv(samples, "UTF-8", "latin1")
setwd("dataset")
write.table(samples,"text_US.txt",row.names=FALSE,col.names=FALSE,quote=FALSE,append=FALSE)
The “samples” variable containing the text is converted to a Corpus and processed using the “tm” library.
cname <- file.path("C:", "Users", "Debanjan", "Dropbox", "COURSERA", "final", "dataset")
docs <- Corpus(DirSource(cname))
summary(docs)
## Length Class Mode
## text_US.txt 2 PlainTextDocument list
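Equivalently, the corpus could be built directly from the in-memory sample vector with tm's VectorSource, avoiding the machine-specific path above (a sketch; note that each sampled line would then become its own document rather than one combined document):
docs <- Corpus(VectorSource(samples))  # alternative: no intermediate file needed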
docs <- tm_map(docs, removePunctuation)                  # strip punctuation
docs <- tm_map(docs, removeNumbers)                      # strip digits
docs <- tm_map(docs, content_transformer(tolower))       # lower-case everything
docs <- tm_map(docs, removeWords, stopwords("english"))  # drop common stopwords
docs <- tm_map(docs, stripWhitespace)                    # collapse repeated whitespace
docs <- tm_map(docs, stemDocument)                       # stem with SnowballC
docs <- tm_map(docs, PlainTextDocument)                  # restore PlainTextDocument class
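A quick sanity check of the cleaning pipeline is to print a snippet of the processed text (a sketch; content() is tm's accessor for document text):
substr(content(docs[[1]])[1], 1, 100)  # first 100 characters of the cleaned text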
The data is then staged to explore word counts and n-gram counts.
dtm <- DocumentTermMatrix(docs)
tdm <- TermDocumentMatrix(docs)
inspect(tdm[1:15,1])
## <<TermDocumentMatrix (terms: 15, documents: 1)>>
## Non-/sparse entries: 15/0
## Sparsity : 0%
## Maximal term length: 7
## Weighting : term frequency (tf)
##
## Docs
## Terms character(0)
## <U+0091>that 1
## <U+0092>em 1
## <U+0093><U+0092>s 1
## <U+0093>let 1
## aam 2
## aaron 5
## aarp 2
## abandon 1
## abbey 1
## abbi 2
## abbrevi 1
## abc 1
## abdomen 1
## abdomin 1
## abduct 1
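The <U+0091>-style terms are stray control characters (Windows curly quotes) left over from the latin1 conversion above. One option is to convert to ASCII instead and delete non-convertible characters (a sketch replacing the earlier iconv call; sub = "" drops any byte with no ASCII equivalent):
samples <- iconv(samples, "UTF-8", "ASCII", sub = "")  # drop non-ASCII characters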
First, the words in the corpus are arranged in decreasing order of the frequency of their occurrence.
freq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)
head(freq, 14)
## one said just get will time like year can make know love day say
## 277 269 250 239 233 222 206 195 194 175 169 163 156 145
# Removing sparse terms
dtms <- removeSparseTerms(dtm, 0.1)
table(freq)
## freq
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 5383 1401 697 422 302 206 166 143 113 91 80 71 57 72 47
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
## 33 44 33 33 34 25 19 23 23 21 21 21 22 13 12
## 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
## 12 7 13 17 9 7 8 6 8 4 6 5 1 2 4
## 46 47 48 49 50 51 52 53 54 55 56 57 58 60 64
## 9 9 2 4 2 2 2 4 3 2 4 1 3 3 2
## 66 67 68 69 70 71 72 74 76 77 79 80 82 85 86
## 1 3 2 3 1 2 2 3 3 2 1 2 1 3 1
## 87 91 92 95 98 100 105 111 121 122 123 124 126 127 132
## 3 1 1 1 1 1 1 1 2 2 1 1 1 2 2
## 133 142 144 145 156 163 169 175 194 195 206 222 233 239 250
## 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1
## 269 277
## 1 1
A table of “freq” shows that many words occur only once, fewer words occur more than about 40 times, and very few words occur more than 90 times.
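The same cutoff can be checked directly with tm's findFreqTerms, which lists every term at or above a given frequency:
findFreqTerms(dtm, lowfreq = 90)  # terms occurring at least 90 times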
The following plots show which words are most common.
wf <- data.frame(word = names(freq), freq=freq)
p <- ggplot(subset(wf, freq>90 & freq < 300), aes(word, freq))
p <- p + geom_bar(stat="identity")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p
wordcloud(names(freq), freq, min.freq=60, scale=c(5, .1), colors=brewer.pal(6, "Dark2"))
## Warning in wordcloud(names(freq), freq, min.freq = 60, scale = c(5, 0.1), :
## one could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(freq), freq, min.freq = 60, scale = c(5, 0.1), :
## year could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(freq), freq, min.freq = 60, scale = c(5, 0.1), :
## know could not be fit on page. It will not be plotted.
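The “could not be fit on page” warnings can usually be avoided by shrinking the upper end of scale or capping the number of words (a sketch; both are standard wordcloud arguments):
wordcloud(names(freq), freq, min.freq=60, scale=c(4, .1), max.words=80, colors=brewer.pal(6, "Dark2"))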
dtmss <- removeSparseTerms(dtms, 0.2)
hc <- hclust(dist(t(dtmss)))
hcd <- as.dendrogram(hc)  # convert to a dendrogram for plotting
plot(cut(hcd, h=150)$lower[[2]],
main="Hierarchical clustering")
require(RWeka)
TwogramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdmn <- TermDocumentMatrix(docs, control = list(tokenize = TwogramTokenizer))
tdmn <- removeSparseTerms(tdmn, 0.75)
inspect(tdmn[501:515,1])
## <<TermDocumentMatrix (terms: 15, documents: 1)>>
## Non-/sparse entries: 15/0
## Sparsity : 0%
## Maximal term length: 18
## Weighting : term frequency (tf)
##
## Docs
## Terms character(0)
## administr just 1
## administr presid 1
## administr taken 1
## admir ambit 1
## admir philanthropi 1
## admiss wac 1
## admit cross 1
## admit even 1
## admit heath 1
## admit littl 1
## admit mayb 1
## admit tricki 1
## admit use 1
## adopt certain 1
## adopt paytoplay 1
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tdmn2 <- TermDocumentMatrix(docs, control = list(tokenize = TrigramTokenizer))
tdmn2 <- removeSparseTerms(tdmn2, 0.75)
inspect(tdmn2[501:515,1])
## <<TermDocumentMatrix (terms: 15, documents: 1)>>
## Non-/sparse entries: 15/0
## Sparsity : 0%
## Maximal term length: 25
## Weighting : term frequency (tf)
##
## Docs
## Terms character(0)
## adulthood face full 1
## advanc actor found 1
## advanc get recognit 1
## advanc idlib part 1
## advanc instrument system 1
## advanc look design 1
## advanc present chanc 1
## advanc student school 1
## advantag role host 1
## advent digit watch 1
## advent sound without 1
## adventur along sightse 1
## adventur baron munchausen 1
## adventur door face 1
## adventur end think 1
The extracted bigrams are then sorted in decreasing order of occurrence.
tdmnMatrix <- as.matrix(tdmn)
tf1grams <- rowSums(tdmnMatrix)                # frequency of each bigram
tf1sorted <- sort(tf1grams, decreasing = TRUE)
tf1noones <- tf1sorted[which(tf1sorted != 1)]  # keep bigrams seen more than once
head(tf1sorted, 10)
## last year right now new york dont know high school last week
## 27 20 19 17 13 13
## look like make sure cant wait last night
## 13 13 12 12
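The trigram matrix can be ranked in the same way (a sketch mirroring the bigram code above; the variable names are illustrative):
tdmn2Matrix <- as.matrix(tdmn2)
tf2grams <- sort(rowSums(tdmn2Matrix), decreasing = TRUE)  # trigram frequencies
head(tf2grams, 10)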