This is a report of basic exploratory data analysis, for a Natural Language Processing project, on English text from news, blogs, and Twitter.
The report covers three steps: 1. a basic data table including file size, word count, and line count; 2. a wordcloud and a bar plot of a text sample; 3. code for an n-gram tokenizer, shown but not run (because of a Java problem).
library(knitr)
library(NLP)
library(tm)
library(stringi)
library(stringr)
library(RWeka)
library(wordcloud)
## Loading required package: RColorBrewer
#file path
twitter.path<-"~/Desktop/Capstone/final/en_US/en_US.twitter.txt"
blogs.path<-"~/Desktop/Capstone/final/en_US/en_US.blogs.txt"
news.path<-"~/Desktop/Capstone/final/en_US/en_US.news.txt"
#load file
twitter <- readLines(twitter.path, encoding="UTF-8")
## Warning in readLines(twitter.path, encoding = "UTF-8"): line 167155 appears
## to contain an embedded nul
## Warning in readLines(twitter.path, encoding = "UTF-8"): line 268547 appears
## to contain an embedded nul
## Warning in readLines(twitter.path, encoding = "UTF-8"): line 1274086
## appears to contain an embedded nul
## Warning in readLines(twitter.path, encoding = "UTF-8"): line 1759032
## appears to contain an embedded nul
blogs <- readLines(blogs.path, encoding="UTF-8")
news <- readLines(news.path, encoding="UTF-8")
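The embedded-nul warnings above come from stray nul bytes in the Twitter file. They are harmless here, but base R's skipNul argument to readLines() can drop those bytes silently; the line below is illustrative and was not part of the original run.
#optional: re-read the twitter file, skipping embedded nul bytes
twitter <- readLines(twitter.path, encoding="UTF-8", skipNul=TRUE)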
#size count in MB
sc<-file.info(c(twitter.path,blogs.path,news.path))$size/1024/1024
sc
## [1] 159.3641 200.4242 196.2775
#word count
wc <- rbind(
  summary(stri_count_words(twitter)),
  summary(stri_count_words(blogs)),
  summary(stri_count_words(news))
)
row.names(wc) <- c("twitter","blogs","news")
wcs <- c(
  sum(stri_count_words(twitter)),
  sum(stri_count_words(blogs)),
  sum(stri_count_words(news))
)
wc
##          Min. 1st Qu. Median  Mean 3rd Qu. Max.
## twitter     1       7     12 12.75      18   47
## blogs       0       9     28 41.75      60 6726
## news        1      19     32 34.41      46 1796
wcs
## [1] 30093369 37546246 34762395
#line count
lc <- cbind(
  stri_stats_general(twitter),
  stri_stats_general(blogs),
  stri_stats_general(news)
)
colnames(lc)<-c("twitter","blogs","news")
lc
##               twitter     blogs      news
## Lines         2360148    899288   1010242
## LinesNEmpty   2360148    899288   1010242
## Chars       162096031 206824382 203223154
## CharsNWhite 134082634 170389539 169860866
#basic data table
bdt<-cbind(lc[1,],sc,wcs)
colnames(bdt)<-c("line count","size in MB","word count")
bdt
##         line count size in MB word count
## twitter    2360148   159.3641   30093369
## blogs       899288   200.4242   37546246
## news       1010242   196.2775   34762395
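Since knitr is already loaded, the same table can also be rendered with kable(); this optional call was not part of the original run.
#optional: nicer rendering of the basic data table
kable(bdt, digits = 2)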
#the full corpus is too large and processing it takes a long time,
#so the code below is only shown, not run
corpus.a<- Corpus(DirSource("~/Desktop/Capstone/final/en_US/"))
corpus.a <- tm_map(corpus.a, stripWhitespace)
corpus.a <- tm_map(corpus.a, removeNumbers)
corpus.a <- tm_map(corpus.a, removePunctuation)
corpus.a <- tm_map(corpus.a, removeWords, stopwords())
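Note that, unlike the sampled pipeline below, this block has no lower-casing step; if it were run, the step could be mirrored here (an illustrative line, not in the original code).
corpus.a <- tm_map(corpus.a, content_transformer(tolower))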
#take a 20% random sample of each file, then use the first 2500 lines of each sample
set.seed(24)
blogs.st <- blogs[sample(length(blogs), 0.2*length(blogs))]
news.st <- news[sample(length(news), 0.2*length(news))]
twitter.st <- twitter[sample(length(twitter), 0.2*length(twitter))]
corpus.s <- VCorpus(VectorSource(
  paste(
    blogs.st[1:2500], news.st[1:2500], twitter.st[1:2500]
  )
))
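As a quick sanity check on the sample (illustrative lines, not in the original run), confirm the corpus size and peek at the first combined document:
#expect 2500 documents, each pasting one line from blogs, news and twitter
length(corpus.s)
#first 80 characters of the first document
substr(content(corpus.s[[1]]), 1, 80)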
# use tm package for basic process of text mining
corpus.s <- tm_map(corpus.s, content_transformer(tolower))
corpus.s <- tm_map(corpus.s, stripWhitespace)
corpus.s <- tm_map(corpus.s, removeNumbers)
corpus.s <- tm_map(corpus.s, removePunctuation)
corpus.s <- tm_map(corpus.s, removeWords, stopwords("english"))
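One caveat: removePunctuation also strips apostrophes, so contractions like "don't" become "dont". Newer tm releases accept arguments to preserve intra-word characters; the alternative below is a sketch that assumes a tm version supporting them.
#alternative to the removePunctuation step above, shown but not run:
corpus.s <- tm_map(corpus.s, removePunctuation,
                   preserve_intra_word_contractions=TRUE,
                   preserve_intra_word_dashes=TRUE)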
#wordcloud plot
cor.mat <- as.matrix(DocumentTermMatrix(corpus.s))
frequency <- sort(colSums(cor.mat),decreasing = T)
wordcloud(names(frequency)[1:25], frequency[1:25])
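wordcloud() places words at random, so the layout differs between runs; fixing the seed before the call makes the figure reproducible. The repeat below is illustrative and was not part of the original run.
#optional, for a reproducible wordcloud layout:
set.seed(24)
wordcloud(names(frequency)[1:25], frequency[1:25])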
#bar plot
barplot(frequency[1:25],
        ylab='token frequency',
        main='top 25 most frequent words in all files',
        names.arg=names(frequency)[1:25],
        col="blue", las=2, cex.names=.7)
Since this part produces the error below on my working Mac, I could only finish it on another PC, so I show just the code here. (Although I have sought help on the course discussion forum and Stack Overflow, I have not found an answer yet.)
“Error in .jnew(name) : java.lang.UnsupportedClassVersionError: weka/core/tokenizers/NGramTokenizer : Unsupported major.minor version 51.0”
#n-grams for the corpus
corpus.d <- data.frame(text = unlist(sapply(corpus.s, `[`, "content")),
                       stringsAsFactors = FALSE)
find.n.gram <- function(x, n) {
  n.gram <- NGramTokenizer(x, Weka_control(min = n, max = n))
  n.gram <- data.frame(table(n.gram))
  n.gram <- n.gram[order(n.gram$Freq, decreasing = TRUE), ][1:100, ]
  colnames(n.gram) <- c("String", "Count")
  n.gram
}
#extract the top 2-, 3-, and 4-grams; wordclouds and bar plots can then be drawn as for the 1-grams above
grams2 <- find.n.gram(corpus.d, 2)
grams3 <- find.n.gram(corpus.d, 3)
grams4 <- find.n.gram(corpus.d, 4)
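If RWeka stays unusable, the already-loaded NLP package offers a Java-free fallback for the same top-100 tables. The helper below is an illustrative sketch (find.n.gram.nojava is my own name, not from the original analysis); it uses a plain whitespace tokenizer plus NLP's ngrams().
#Java-free alternative to find.n.gram (illustrative sketch, not run)
find.n.gram.nojava <- function(x, n) {
  tokens <- unlist(strsplit(x, "\\s+"))   #simple whitespace tokenizer
  grams <- vapply(ngrams(tokens, n), paste, character(1), collapse=" ")
  freq <- sort(table(grams), decreasing=TRUE)[1:100]
  data.frame(String=names(freq), Count=as.integer(freq), stringsAsFactors=FALSE)
}
#e.g. grams2 <- find.n.gram.nojava(corpus.d$text, 2)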