library(tm)
## Loading required package: NLP
library(RWeka)
library(ngram)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
cdr<-"~/Dropbox/Univ/Coursera/JH_R/JH_Data_Science/10-Capstone/final/en_US"
### reading files
data.blogs <- readLines(file.path(cdr, "en_US.blogs.txt"))
data.news <- readLines(file.path(cdr, "en_US.news.txt"))
data.twitter <- readLines(file.path(cdr, "en_US.twitter.txt"), skipNul = TRUE)
Source <- c("blogs", "news", "twitter")
Size.MB <- c(file.size(file.path(cdr, "en_US.blogs.txt")),
             file.size(file.path(cdr, "en_US.news.txt")),
             file.size(file.path(cdr, "en_US.twitter.txt"))) / 1024^2
Number.linesK <- c(length(data.blogs), length(data.news), length(data.twitter)) / 1e3
Number.wordsK <- c(wordcount(data.blogs), wordcount(data.news), wordcount(data.twitter)) / 1e3
data.frame(Source, Size.MB, Number.linesK, Number.wordsK)
## Source Size.MB Number.linesK Number.wordsK
## 1 blogs 200.4242 899.288 37334.13
## 2 news 196.2775 1010.242 34372.53
## 3 twitter 159.3641 2360.148 30373.58
## Sampling data
set.seed(123)
sample_rate <- 0.003
data.sample <- c(sample(data.blogs, length(data.blogs) * sample_rate),
                 sample(data.news, length(data.news) * sample_rate),
                 sample(data.twitter, length(data.twitter) * sample_rate))
## Creating corpus and cleaning data
options(mc.cores=2)
corpus <- VCorpus(VectorSource(data.sample))
corpus <- tm_map(corpus, tolower)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, PlainTextDocument)
## Extracting n-grams
UniToken<-function(x)NGramTokenizer(x, Weka_control(min = 1, max = 1))
BiToken<-function(x)NGramTokenizer(x, Weka_control(min = 2, max = 2))
TriToken<-function(x)NGramTokenizer(x, Weka_control(min = 3, max = 3))
### creating UniGram
tdm1<-TermDocumentMatrix(corpus,control = list(tokenize=UniToken))
fm1<-rowSums(as.matrix(tdm1))
UniGram<-data.frame(word=names(fm1),freq=fm1)
UniGram<-UniGram[order(-UniGram$freq),]
head(UniGram)
## word freq
## will will 979
## just just 962
## said said 887
## one one 828
## like like 764
## can can 745
### creating BiGram
tdm2<-TermDocumentMatrix(corpus,control = list(tokenize=BiToken))
fm2<-rowSums(as.matrix(tdm2))
BiGram<-data.frame(word=names(fm2),freq=fm2)
BiGram<-BiGram[order(-BiGram$freq),]
head(BiGram)
## word freq
## right now right now 90
## last year last year 60
## new york new york 58
## last night last night 49
## high school high school 43
## years ago years ago 41
### creating TriGram
tdm3 <- removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = TriToken)),
                          0.9999)
fm3<-rowSums(as.matrix(tdm3))
TriGram<-data.frame(word=names(fm3),freq=fm3)
TriGram<-TriGram[order(-TriGram$freq),]
head(TriGram)
## word freq
## happy mothers day happy mothers day 12
## new york city new york city 10
## happy new year happy new year 8
## osama bin laden osama bin laden 6
## new years eve new years eve 5
## cinco de mayo cinco de mayo 4
make_plot <- function(data, label) {
  ggplot(data[1:50, ], aes(reorder(word, -freq), freq)) +
    labs(x = label, y = "Frequency") +
    theme(axis.text.x = element_text(angle = 90, size = 11, hjust = 1)) +
    geom_bar(stat = "identity")
}
make_plot(UniGram, "Unigrams")
make_plot(BiGram, "Bigrams")
make_plot(TriGram, "Trigrams")
A few very common words dominate the unigram distribution (the most frequent words appear far more often than the rest). This skew becomes less extreme for the bigrams and is much weaker for the trigrams.
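As a rough numeric check of this skew, one could compare the share of all occurrences captured by the top 50 entries of each table; top_share below is a small helper introduced only for this sketch.
# Share of all n-gram occurrences captured by the n most frequent entries
# (the UniGram/BiGram/TriGram tables above are already sorted by frequency).
top_share <- function(tbl, n = 50) sum(head(tbl$freq, n)) / sum(tbl$freq)
sapply(list(UniGram = UniGram, BiGram = BiGram, TriGram = TriGram), top_share)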
# x: an n-gram frequency table sorted by decreasing frequency;
# cover_rate: the fraction of all occurrences that should be covered.
coverage <- function(x, cover_rate) {
  nwords <- 0                          # occurrences covered so far
  cover <- cover_rate * sum(x$freq)    # occurrences needed to hit the target coverage
  for (i in 1:nrow(x)) {
    if (nwords >= cover) return(i)     # target reached: report how many terms were needed
    nwords <- nwords + x$freq[i]
  }
}
coverage(UniGram,0.5)
## [1] 967
coverage(UniGram,0.9)
## [1] 12024
x <- seq(0.1, 0.9, by = 0.1)
y <- sapply(x, function(r) coverage(UniGram, r))
qplot(x, y, geom = "line", xlab = "Coverage", ylab = "Number of words",
      main = "Number of words required to attain coverage")
x <- seq(0.1, 0.9, by = 0.1)
y <- sapply(x, function(r) coverage(BiGram, r))
qplot(x, y, geom = "line", xlab = "Coverage", ylab = "Number of BiGrams",
      main = "Number of BiGrams required to attain coverage")
The curve of the number of words required for a given coverage suggests a trade-off between coverage and memory utilization. This trade-off will be taken into account in the later predictive analysis.
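For illustration, the vocabulary could be pruned to the smallest set of words that reaches a chosen coverage; a minimal sketch using the coverage() function and UniGram table above (the 0.9 target is only an example).
n_keep <- coverage(UniGram, 0.9)
UniGram.small <- UniGram[1:n_keep, ]              # keep only the words needed for ~90% coverage
format(object.size(UniGram), units = "Kb")        # memory footprint of the full table
format(object.size(UniGram.small), units = "Kb")  # footprint of the pruned table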
This may be done with a pure English word list (a large dictionary). If a word in our corpus is not in that dictionary, it is most likely of foreign origin. However, our corpus may also include some "new" words that are English but not yet in the dictionary; a small sketch of this check follows the next question.

### Can you think of a way to increase the coverage?

The three sources we use (news, twitter, blogs) may be biased relative to the vocabulary of a typical SwiftKey user. Adding other general sources (literature, papers, etc.) when building the corpus could increase coverage.
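As a sketch of the dictionary check for foreign words mentioned above (english_words.txt is a hypothetical word list, one word per line; any large English dictionary file would do):
dict <- tolower(readLines("english_words.txt"))   # hypothetical English word list
UniGram$in_dict <- UniGram$word %in% dict         # flag unigrams found in the dictionary
head(UniGram[!UniGram$in_dict, ])                 # candidates for foreign (or new) words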
In the coming predictive model building for a Shiny application, I would consider the following (see the sketch after this list):

- Efficiency in memory utilization (think Markov chains).
- Using knowledge of word frequencies to balance coverage and efficiency, with either the number of words or the coverage as a parameter.
- Smoothing techniques.
- Model accuracy, evaluated with some test sentences (from newspapers, etc.).
- A flag to check whether an n-gram is not in the corpus (even with smoothing).
- Memory utilization and run time of the model should be measured.
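As a rough illustration of the back-off direction (not the final model), the TriGram and BiGram tables built above could be queried directly; predict_next is a hypothetical helper introduced only for this sketch.
# A very small back-off sketch: look up the last two words in the TriGram table,
# fall back to the last word in the BiGram table, and finally to the most
# frequent unigram. The frequency tables are already sorted by decreasing freq.
predict_next <- function(phrase) {
  tokens <- unlist(strsplit(tolower(phrase), "\\s+"))
  last_word <- function(ngram) tail(unlist(strsplit(as.character(ngram), " ")), 1)

  if (length(tokens) >= 2) {
    prefix <- paste(tail(tokens, 2), collapse = " ")
    hits <- TriGram[startsWith(as.character(TriGram$word), paste0(prefix, " ")), ]
    if (nrow(hits) > 0) return(last_word(hits$word[1]))  # most frequent matching trigram
  }
  if (length(tokens) >= 1) {
    prefix <- tail(tokens, 1)
    hits <- BiGram[startsWith(as.character(BiGram$word), paste0(prefix, " ")), ]
    if (nrow(hits) > 0) return(last_word(hits$word[1]))  # most frequent matching bigram
  }
  as.character(UniGram$word[1])                          # fall back to the top unigram
}
predict_next("happy mothers")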