Data and Package Loading

library(tm)
## Loading required package: NLP
library(RWeka)
library(ngram)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
cdr<-"~/Dropbox/Univ/Coursera/JH_R/JH_Data_Science/10-Capstone/final/en_US"
### reading files
data.blogs<- readLines(file.path(cdr,"en_US.blogs.txt"))
data.news<- readLines(file.path(cdr, "en_US.news.txt"))
data.twitter<- readLines(file.path(cdr,"en_US.twitter.txt") ,skipNul = T)
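Note: on some platforms readLines() warns about embedded nuls or an incomplete final line for these files. If that happens, reading through a binary-mode connection is a common workaround (a sketch only; not needed when the calls above run cleanly):

### optional workaround: binary-mode read avoids embedded-nul / incomplete-final-line warnings
read_bin <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con))
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}
### e.g. data.news <- read_bin(file.path(cdr, "en_US.news.txt"))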

Examining the data

Source <- c("blogs", "news", "twitter")
Size.MB <- c(file.size(file.path(cdr, "en_US.blogs.txt"))   / 1024^2,
             file.size(file.path(cdr, "en_US.news.txt"))    / 1024^2,
             file.size(file.path(cdr, "en_US.twitter.txt")) / 1024^2)
Number.linesK <- c(length(data.blogs) / 1e3, length(data.news) / 1e3, length(data.twitter) / 1e3)
Number.wordsK <- c(wordcount(data.blogs) / 1e3, wordcount(data.news) / 1e3, wordcount(data.twitter) / 1e3)
data.frame(Source, Size.MB, Number.linesK, Number.wordsK)
##    Source  Size.MB Number.linesK Number.wordsK
## 1   blogs 200.4242       899.288      37334.13
## 2    news 196.2775      1010.242      34372.53
## 3 twitter 159.3641      2360.148      30373.58

Word Frequencies

## Sampling data
set.seed(123)
sample_rate<-0.003
data.sample <- c(sample(data.blogs,   length(data.blogs)   * sample_rate),
                 sample(data.news,    length(data.news)    * sample_rate),
                 sample(data.twitter, length(data.twitter) * sample_rate))
## Creating corpus and cleaning data
options(mc.cores=2)
corpus <- VCorpus(VectorSource(data.sample))
corpus <- tm_map(corpus, content_transformer(tolower)) # content_transformer keeps documents as PlainTextDocument
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
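### optional sanity check: the cleaning above should leave plain, lowercased text (inspect one document)
as.character(corpus[[1]])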
## extracting n-grams
UniToken<-function(x)NGramTokenizer(x, Weka_control(min = 1, max = 1))
BiToken<-function(x)NGramTokenizer(x, Weka_control(min = 2, max = 2))
TriToken<-function(x)NGramTokenizer(x, Weka_control(min = 3, max = 3))
### creating UniGram
tdm1<-TermDocumentMatrix(corpus,control = list(tokenize=UniToken))
fm1<-rowSums(as.matrix(tdm1))
UniGram<-data.frame(word=names(fm1),freq=fm1)
UniGram<-UniGram[order(-UniGram$freq),]
head(UniGram)
##      word freq
## will will  979
## just just  962
## said said  887
## one   one  828
## like like  764
## can   can  745
### creating BiGram
tdm2<-TermDocumentMatrix(corpus,control = list(tokenize=BiToken))
fm2<-rowSums(as.matrix(tdm2))
BiGram<-data.frame(word=names(fm2),freq=fm2)
BiGram<-BiGram[order(-BiGram$freq),]      
head(BiGram)
##                    word freq
## right now     right now   90
## last year     last year   60
## new york       new york   58
## last night   last night   49
## high school high school   43
## years ago     years ago   41
### creating TriGram
tdm3<-removeSparseTerms(TermDocumentMatrix(corpus,control = list(tokenize=TriToken)),0.9999)
fm3<-rowSums(as.matrix(tdm3))
TriGram<-data.frame(word=names(fm3),freq=fm3)
TriGram<-TriGram[order(-TriGram$freq),]        
head(TriGram)
##                                word freq
## happy mothers day happy mothers day   12
## new york city         new york city   10
## happy new year       happy new year    8
## osama bin laden     osama bin laden    6
## new years eve         new years eve    5
## cinco de mayo         cinco de mayo    4
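Note: removeSparseTerms() is applied only to the trigram matrix because the number of distinct trigrams grows much faster than that of unigrams or bigrams, so the untrimmed trigram matrix would be very large and extremely sparse. An optional check of the resulting dimensions:

dim(tdm1)  # unigram terms x documents
dim(tdm2)  # bigram terms x documents
dim(tdm3)  # trigram terms x documents (after removeSparseTerms)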

Plotting n-grams

make_plot <- function(data, label) {
  ggplot(data[1:50, ], aes(reorder(word, -freq), freq)) +
    geom_bar(stat = "identity") +
    labs(x = label, y = "Frequency") +
    theme(axis.text.x = element_text(angle = 90, size = 11, hjust = 1))
}
make_plot(UniGram, "Unigrams")

make_plot(BiGram, "Bigrams")

make_plot(TriGram, "Trigrams")

There are some very common words among the unigrams (a few words appear much more often than the rest). This characteristic becomes less extreme for bigrams and is very weak for trigrams.

Coverage by n-grams

Examine word coverage at 50% and 90%, then plot the number of words required as the coverage target changes. The same analysis is then repeated for bigrams.

coverage <- function(x, cover_rate) {
  # x: n-gram frequency table sorted by descending frequency
  # cover_rate: fraction of total word occurrences to cover
  cover <- cover_rate * sum(x$freq)  # number of occurrences needed to hit coverage
  nwords <- 0                        # running total of occurrences
  for (i in 1:nrow(x)) {
    if (nwords >= cover) {return(i)}
    nwords <- nwords + x$freq[i]
  }
}
coverage(UniGram,0.5)
## [1] 967
coverage(UniGram,0.9)
## [1] 12024
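For reference, the same quantity can be computed without an explicit loop. This is only a sketch; its result may differ by one from the loop above, since the loop returns the index just after coverage is reached.

coverage_vec <- function(x, cover_rate) {
  cum_share <- cumsum(x$freq) / sum(x$freq)  # cumulative share of occurrences
  which(cum_share >= cover_rate)[1]          # first rank reaching the target coverage
}
coverage_vec(UniGram, 0.5)
coverage_vec(UniGram, 0.9)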

Visualizing coverage and the number of words required

x <- seq(0.1, 0.9, by = 0.1)
y <- sapply(x, function(r) coverage(UniGram, r))

qplot(x, y, geom = c("line"), xlab="Coverage", ylab="Number of words", main="Number of words required to attain coverage")

x <- seq(0.1, 0.9, by = 0.1)
y <- sapply(x, function(r) coverage(BiGram, r))

qplot(x, y, geom = c("line"), xlab="Coverage", ylab="Number of BiGram", main="Number of BiGram required to attain coverage")

The curves of the number of n-grams required versus coverage suggest a trade-off to consider between coverage and memory utilization. This will be taken into account in the later predictive modelling.
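As a rough illustration of that trade-off (not part of the model itself), dropping n-grams that occur only once shrinks the frequency tables considerably; how much coverage this sacrifices can be checked with the coverage() function above.

### illustration only: prune n-grams seen once and compare table sizes
BiGram.pruned <- BiGram[BiGram$freq > 1, ]
c(all = nrow(BiGram), pruned = nrow(BiGram.pruned))
format(object.size(BiGram), units = "KB")
format(object.size(BiGram.pruned), units = "KB")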

Other questions

How do you evaluate how many of the words come from foreign languages?

This may be done with a pure English word corpus (a large dictionary). If a word in our corpus is not in that dictionary, it is most likely of foreign origin (a sketch of such a check appears at the end of this section). However, our corpus may also contain some "new words" that are English but not yet in any dictionary.

Can you think of a way to increase the coverage?

The three sources we use (blogs, news, twitter) may carry a vocabulary that is biased relative to a general SwiftKey user. Adding other general sources (e.g. literature, papers) when building the corpus could increase coverage.
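Below is a minimal sketch of the dictionary check mentioned above. The word list file english_words.txt (one lowercase English word per line) is a hypothetical input, not part of this analysis.

### hypothetical input: "english_words.txt", one lowercase English word per line
dict <- readLines("english_words.txt")
is.english <- tolower(UniGram$word) %in% dict
sum(UniGram$freq[!is.english]) / sum(UniGram$freq)  # share of word occurrences not found in the dictionary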

Plan for creating a predictive model

For the coming predictive model, to be built as a Shiny application, I will consider the following:

- Efficiency in memory utilization (think Markov chains); a rough sketch of the idea follows below.
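As a very rough sketch of the Markov-chain idea (illustration only, not the planned model), the bigram table built above already supports a simple next-word lookup:

### illustration only: most frequent continuations of a word, read off the bigram table
predict_next <- function(prev_word, bigrams = BiGram, n = 3) {
  pattern <- paste0("^", prev_word, " ")
  hits <- bigrams[grepl(pattern, bigrams$word), ]  # bigrams starting with prev_word
  head(sub(pattern, "", hits$word), n)             # keep the top-n second words
}
predict_next("new")  # should suggest frequent continuations such as "york"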