Data and Package Loading

library(tm)
## Loading required package: NLP
library(RWeka)
library(ngram)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
cdr<-"~/Dropbox/Univ/Coursera/JH_R/JH_Data_Science/10-Capstone/final/en_US"
### reading files
data.blogs<- readLines(file.path(cdr,"en_US.blogs.txt"))
data.news<- readLines(file.path(cdr, "en_US.news.txt"))
data.twitter<- readLines(file.path(cdr,"en_US.twitter.txt") ,skipNul = T)
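Note: on some platforms readLines() warns about embedded nuls or an incomplete final line for these files. If that happens, reading through a binary-mode connection is a common workaround (a sketch only; not needed when the calls above run cleanly):

### optional workaround: binary-mode read avoids embedded-nul / incomplete-final-line warnings
read_bin <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con))
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}
### e.g. data.news <- read_bin(file.path(cdr, "en_US.news.txt"))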

Examining the data

Source <- c("blogs", "news", "twitter")
Size.MB <- c(file.size(file.path(cdr, "en_US.blogs.txt"))   / 1024^2,
             file.size(file.path(cdr, "en_US.news.txt"))    / 1024^2,
             file.size(file.path(cdr, "en_US.twitter.txt")) / 1024^2)
Number.linesK <- c(length(data.blogs) / 1e3, length(data.news) / 1e3, length(data.twitter) / 1e3)
Number.wordsK <- c(wordcount(data.blogs) / 1e3, wordcount(data.news) / 1e3, wordcount(data.twitter) / 1e3)
data.frame(Source, Size.MB, Number.linesK, Number.wordsK)
##    Source  Size.MB Number.linesK Number.wordsK
## 1   blogs 200.4242       899.288      37334.13
## 2    news 196.2775      1010.242      34372.53
## 3 twitter 159.3641      2360.148      30373.58

Word Frequencies

## Sampling data
set.seed(123)
sample_rate<-0.003
data.sample <- c(sample(data.blogs,   length(data.blogs)   * sample_rate),
                 sample(data.news,    length(data.news)    * sample_rate),
                 sample(data.twitter, length(data.twitter) * sample_rate))
## Creating corpus and cleaning data
options(mc.cores=2)
corpus <- VCorpus(VectorSource(data.sample))
corpus <- tm_map(corpus, content_transformer(tolower)) # content_transformer keeps documents as PlainTextDocument
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
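### optional sanity check: the cleaning above should leave plain, lowercased text (inspect one document)
as.character(corpus[[1]])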
## extracting n-grams
UniToken<-function(x)NGramTokenizer(x, Weka_control(min = 1, max = 1))
BiToken<-function(x)NGramTokenizer(x, Weka_control(min = 2, max = 2))
TriToken<-function(x)NGramTokenizer(x, Weka_control(min = 3, max = 3))
### creating UniGram
tdm1<-TermDocumentMatrix(corpus,control = list(tokenize=UniToken))
fm1<-rowSums(as.matrix(tdm1))
UniGram<-data.frame(word=names(fm1),freq=fm1)
UniGram<-UniGram[order(-UniGram$freq),]
head(UniGram)
##      word freq
## will will  979
## just just  962
## said said  887
## one   one  828
## like like  764
## can   can  745
### creating BiGram
tdm2<-TermDocumentMatrix(corpus,control = list(tokenize=BiToken))
fm2<-rowSums(as.matrix(tdm2))
BiGram<-data.frame(word=names(fm2),freq=fm2)
BiGram<-BiGram[order(-BiGram$freq),]      
head(BiGram)
##                    word freq
## right now     right now   90
## last year     last year   60
## new york       new york   58
## last night   last night   49
## high school high school   43
## years ago     years ago   41
### creating TriGram
tdm3<-removeSparseTerms(TermDocumentMatrix(corpus,control = list(tokenize=TriToken)),0.9999)
fm3<-rowSums(as.matrix(tdm3))
TriGram<-data.frame(word=names(fm3),freq=fm3)
TriGram<-TriGram[order(-TriGram$freq),]        
head(TriGram)
##                                word freq
## happy mothers day happy mothers day   12
## new york city         new york city   10
## happy new year       happy new year    8
## osama bin laden     osama bin laden    6
## new years eve         new years eve    5
## cinco de mayo         cinco de mayo    4
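Note: removeSparseTerms() is applied only to the trigram matrix because the number of distinct trigrams grows much faster than that of unigrams or bigrams, so the untrimmed trigram matrix would be very large and extremely sparse. An optional check of the resulting dimensions:

dim(tdm1)  # unigram terms x documents
dim(tdm2)  # bigram terms x documents
dim(tdm3)  # trigram terms x documents (after removeSparseTerms)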

Plotting n-grams

make_plot <- function(data, label) {
  ggplot(data[1:50, ], aes(reorder(word, -freq), freq)) +
    geom_bar(stat = "identity") +
    labs(x = label, y = "Frequency") +
    theme(axis.text.x = element_text(angle = 90, size = 11, hjust = 1))
}
make_plot(UniGram, "Unigrams")

make_plot(BiGram, "Bigrams")

make_plot(TriGram, "Trigrams")

There are some very common words among the unigrams (a few words appear much more often than the rest). This characteristic becomes less extreme for bigrams and is very weak for trigrams.

Coverage by n-grams

Examine word coverage at 50% and 90%, then plot the number of words required as the coverage target changes. The same analysis is then repeated for bigrams.

coverage <- function(x, cover_rate) {
  # x: n-gram frequency table sorted by descending frequency
  # cover_rate: fraction of total word occurrences to cover
  cover <- cover_rate * sum(x$freq)  # number of occurrences needed to hit coverage
  nwords <- 0                        # running total of occurrences
  for (i in 1:nrow(x)) {
    if (nwords >= cover) {return(i)}
    nwords <- nwords + x$freq[i]
  }
}
coverage(UniGram,0.5)
## [1] 967
coverage(UniGram,0.9)
## [1] 12024
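For reference, the same quantity can be computed without an explicit loop. This is only a sketch; its result may differ by one from the loop above, since the loop returns the index just after coverage is reached.

coverage_vec <- function(x, cover_rate) {
  cum_share <- cumsum(x$freq) / sum(x$freq)  # cumulative share of occurrences
  which(cum_share >= cover_rate)[1]          # first rank reaching the target coverage
}
coverage_vec(UniGram, 0.5)
coverage_vec(UniGram, 0.9)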

Visualizing coverage and the number of words required

x <- seq(0.1, 0.9, by = 0.1)
y <- sapply(x, function(r) coverage(UniGram, r))

qplot(x, y, geom = c("line"), xlab="Coverage", ylab="Number of words", main="Number of words required to attain coverage")

x <- seq(0.1, 0.9, by = 0.1)
y <- sapply(x, function(r) coverage(BiGram, r))

qplot(x, y, geom = c("line"), xlab="Coverage", ylab="Number of BiGram", main="Number of BiGram required to attain coverage")

The curves of the number of n-grams required versus coverage suggest a trade-off to consider between coverage and memory utilization. This will be taken into account in the later predictive modelling.
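As a rough illustration of that trade-off (not part of the model itself), dropping n-grams that occur only once shrinks the frequency tables considerably; how much coverage this sacrifices can be checked with the coverage() function above.

### illustration only: prune n-grams seen once and compare table sizes
BiGram.pruned <- BiGram[BiGram$freq > 1, ]
c(all = nrow(BiGram), pruned = nrow(BiGram.pruned))
format(object.size(BiGram), units = "KB")
format(object.size(BiGram.pruned), units = "KB")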

Other questions

How do you evaluate how many of the words come from foreign languages?

This may be done with a pure English word corpus (a large dictionary). If a word in our corpus is not in that dictionary, it is most likely of foreign origin (a sketch of such a check appears at the end of this section). However, our corpus may also contain some "new words" that are English but not yet in any dictionary.

Can you think of a way to increase the coverage?

The three sources we use (blogs, news, twitter) may carry a vocabulary that is biased relative to a general SwiftKey user. Adding other general sources (e.g. literature, papers) when building the corpus could increase coverage.
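Below is a minimal sketch of the dictionary check mentioned above. The word list file english_words.txt (one lowercase English word per line) is a hypothetical input, not part of this analysis.

### hypothetical input: "english_words.txt", one lowercase English word per line
dict <- readLines("english_words.txt")
is.english <- tolower(UniGram$word) %in% dict
sum(UniGram$freq[!is.english]) / sum(UniGram$freq)  # share of word occurrences not found in the dictionary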

Plan for creating a predictive model

For the coming predictive model, to be built as a Shiny application, I will consider the following:

- Efficiency in memory utilization (think Markov chains); a rough sketch of the idea follows below.
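As a very rough sketch of the Markov-chain idea (illustration only, not the planned model), the bigram table built above already supports a simple next-word lookup:

### illustration only: most frequent continuations of a word, read off the bigram table
predict_next <- function(prev_word, bigrams = BiGram, n = 3) {
  pattern <- paste0("^", prev_word, " ")
  hits <- bigrams[grepl(pattern, bigrams$word), ]  # bigrams starting with prev_word
  head(sub(pattern, "", hits$word), n)             # keep the top-n second words
}
predict_next("new")  # should suggest frequent continuations such as "york"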