Abstract

This is a report on basic exploratory data analysis for Natural Language Processing of English text from news, blogs, and Twitter.

This report covers three steps: 1. a basic data table including file size, word count, and line count; 2. a word cloud and bar plot of a text sample; 3. code for an n-gram tokenizer, shown but not run (because of a Java problem).

Load Packages

library(knitr)
library(NLP)
library(tm)
library(stringi)
library(stringr)
library(RWeka)
library(wordcloud)
## Loading required package: RColorBrewer

Basic File Processing

#file path
twitter.path<-"~/Desktop/Capstone/final/en_US/en_US.twitter.txt"
blogs.path<-"~/Desktop/Capstone/final/en_US/en_US.blogs.txt"
news.path<-"~/Desktop/Capstone/final/en_US/en_US.news.txt"

#load file
twitter <- readLines(twitter.path, encoding="UTF-8")
## Warning in readLines(twitter.path, encoding = "UTF-8"): line 167155 appears
## to contain an embedded nul
## Warning in readLines(twitter.path, encoding = "UTF-8"): line 268547 appears
## to contain an embedded nul
## Warning in readLines(twitter.path, encoding = "UTF-8"): line 1274086
## appears to contain an embedded nul
## Warning in readLines(twitter.path, encoding = "UTF-8"): line 1759032
## appears to contain an embedded nul
blogs <- readLines(blogs.path, encoding="UTF-8")
news  <- readLines(news.path, encoding="UTF-8")
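
The embedded-nul warnings come from stray nul bytes in the twitter file; they are harmless for the counts below. If desired, readLines can skip them directly via its skipNul argument; a minimal sketch:

#optional: re-read the twitter file while skipping embedded nul characters
twitter <- readLines(twitter.path, encoding="UTF-8", skipNul=TRUE)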

#size count in MB
sc<-file.info(c(twitter.path,blogs.path,news.path))$size/1024/1024 
sc
## [1] 159.3641 200.4242 196.2775
#word count
wc<-rbind(
  summary(stri_count_words(twitter)),
  summary(stri_count_words(blogs)),
  summary(stri_count_words(news))
)
row.names(wc)<-c("twitter","blogs","news")

wcs<-c(
  sum(stri_count_words(twitter)),
  sum(stri_count_words(blogs)),
  sum(stri_count_words(news))
)
wc
##         Min. 1st Qu. Median  Mean 3rd Qu. Max.
## twitter    1       7     12 12.75      18   47
## blogs      0       9     28 41.75      60 6726
## news       1      19     32 34.41      46 1796
wcs
## [1] 30093369 37546246 34762395
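
As a side note, stri_count_words is run twice per file above (once for the summary, once for the sum). Tokenizing each file once and reusing the counts roughly halves the work; a small refactoring sketch (word.counts is just an illustrative name):

#count words once per file, then reuse for both summary and sum
word.counts <- list(twitter = stri_count_words(twitter),
                    blogs   = stri_count_words(blogs),
                    news    = stri_count_words(news))
wc  <- t(sapply(word.counts, summary))
wcs <- sapply(word.counts, sum)
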
#line count
lc<-cbind(
  stri_stats_general(twitter),
  stri_stats_general(blogs),
  stri_stats_general(news)
)
colnames(lc)<-c("twitter","blogs","news")
lc
##               twitter     blogs      news
## Lines         2360148    899288   1010242
## LinesNEmpty   2360148    899288   1010242
## Chars       162096031 206824382 203223154
## CharsNWhite 134082634 170389539 169860866
#basic data table
bdt<-cbind(lc[1,],sc,wcs)
colnames(bdt)<-c("line count","size in MB","word count")
bdt
##         line count size in MB word count
## twitter    2360148   159.3641   30093369
## blogs       899288   200.4242   37546246
## news       1010242   196.2775   34762395
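
Since knitr is already loaded, the same table can also be rendered more cleanly in the report with kable; for example:

#optional: nicer table rendering via knitr
kable(bdt, digits=2, caption="Basic statistics of the three en_US files")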

Corpus Processing

#the full corpus is too large and consumes a lot of time
#only show the code here, not run
corpus.a<- Corpus(DirSource("~/Desktop/Capstone/final/en_US/"))
corpus.a <- tm_map(corpus.a, stripWhitespace)
corpus.a <- tm_map(corpus.a, removeNumbers)
corpus.a <- tm_map(corpus.a, removePunctuation)
corpus.a <- tm_map(corpus.a, removeWords, stopwords())

Wordcloud and Bar Plot

#sample 20% of each corpus at random, then keep the first 2500 lines of each sample
set.seed(24)
blogs.st <- blogs[sample(length(blogs), 0.2*length(blogs))]
news.st <- news[sample(length(news), 0.2*length(news))]
twitter.st <- twitter[sample(length(twitter), 0.2*length(twitter))]

#paste() combines the samples element-wise, so each of the 2500
#documents holds one blog, one news, and one twitter line
corpus.s <- VCorpus(VectorSource(
  paste(
    blogs.st[1:2500], news.st[1:2500], twitter.st[1:2500]
    )
  ))

#use the tm package for basic text-mining preprocessing
corpus.s <- tm_map(corpus.s, content_transformer(tolower)) 
corpus.s <- tm_map(corpus.s, stripWhitespace)
corpus.s <- tm_map(corpus.s, removeNumbers)
corpus.s <- tm_map(corpus.s, removePunctuation)
corpus.s <- tm_map(corpus.s, removeWords, stopwords("english"))

#wordcloud plot
cor.mat <- as.matrix(DocumentTermMatrix(corpus.s))
frequency <- sort(colSums(cor.mat),decreasing = T)
wordcloud(names(frequency)[1:25], frequency[1:25])
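
Because the wordcloud package already loads RColorBrewer, a colored and reproducible version of the plot needs only small changes; for instance:

#optional: fix the layout seed and color the cloud with a brewer palette
set.seed(24)
wordcloud(names(frequency)[1:25], frequency[1:25],
          colors=brewer.pal(8, "Dark2"))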

#bar plot
barplot(frequency[1:25],
        ylab='token frequency',
        main='top 25 most frequent words in all files',
        names.arg=names(frequency)[1:25],        
        col="blue", las=2, cex.names=.7)

N-Gram Remarks

On my working Mac, R comes up with the error below, so I finished this part on another PC and only show the code here. (Although I sought help on the discussion forum and Stack Overflow, I have not found an answer yet.)

“Error in .jnew(name) : java.lang.UnsupportedClassVersionError: weka/core/tokenizers/NGramTokenizer : Unsupported major.minor version 51.0”

#flatten the processed corpus into a data frame for the n-gram tokenizer
corpus.d <-data.frame(text=unlist(sapply(corpus.s, 
                                         `[`, "content")), stringsAsFactors=F)

find.n.gram <- function(x, n) {
  n.gram <- NGramTokenizer(x, Weka_control(min = n, max = n))
  n.gram <- data.frame(table(n.gram))
  n.gram <- n.gram[order(n.gram$Freq, decreasing = TRUE),][1:100,]
  colnames(n.gram) <- c("String","Count")
  n.gram
}

#compute 2-, 3- and 4-grams; word clouds and bar plots can be drawn the same way as for 1-grams
grams2 <- find.n.gram(corpus.d, 2)
grams3 <- find.n.gram(corpus.d, 3)
grams4 <- find.n.gram(corpus.d, 4)
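
As a possible workaround for the Java error, the ngrams helper in the already-loaded NLP package can build n-grams without RWeka. A minimal sketch, assuming simple whitespace tokenization is acceptable (find.n.gram.nlp is just an illustrative name):

#hypothetical Java-free alternative to NGramTokenizer using NLP::ngrams
find.n.gram.nlp <- function(x, n) {
  tokens <- unlist(strsplit(x, "\\s+"))   #crude whitespace tokenizer
  grams  <- vapply(ngrams(tokens, n), paste, character(1), collapse=" ")
  n.gram <- data.frame(table(grams))
  n.gram <- n.gram[order(n.gram$Freq, decreasing=TRUE),][1:100,]
  colnames(n.gram) <- c("String","Count")
  n.gram
}

grams2.nlp <- find.n.gram.nlp(corpus.d$text, 2)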