Objectives

Our final purpose is to build a predictive model for text.
The first step is to understand the distribution of, and the relationships between, the words, tokens, and phrases in the text.

1/ Load and explore data

NB: because of a knitr memory problem, not all of the data have been processed in this report.

##install.packages("stringr")
##library(stringr)

##install.packages("tm")
##library(tm)

##install.packages("RWeka")
##library(RWeka)

mydata1 = read.table("en_US.blogs.txt", sep="\t", header=TRUE)
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec =
## dec, : EOF within quoted string
dim(mydata1) ## 299,999 rows
## [1] 299999      1
str_length(mydata1) ## 2,293,365 characters
## [1] 2293365
##mydata2 = read.table("en_US.news.txt", sep="\t", header=TRUE)
##dim(mydata2) ## 25,353 rows
##str_length(mydata2) ## 166,696 characters

##mydata3 = read.table("en_US.twitter.txt", sep="\t", header=TRUE)
##dim(mydata3) ## 829,086 rows
##str_length(mydata3) ## 6,532,868 characters
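
The warning above comes from unescaped quote characters in the raw text (read.table hits an apparent end-of-file inside a quoted string). A minimal sketch of a more robust alternative, not evaluated in this report (readLines treats each line as plain text, so quotes are harmless):

con <- file("en_US.blogs.txt", "r")
blogs <- readLines(con, skipNul = TRUE) ## skipNul drops embedded NUL bytes
close(con)
length(blogs) ## one element per line of the file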

2/ Should we merge the 3 sources (twitter, blogs, news)?

We have 2 options:
- merge the 3 sources => pros: more text volume, and thus more accuracy in the final model;
- analyze each source separately => pros: we get a model for each context. We should keep the final purpose in mind (if it is predicting the next SMS word, the "blogs" data will be more useful than the "news" data...). A sketch of the merge option follows below.
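
If we choose to merge, a minimal sketch (assuming the three files have already been read into the character vectors blogs, news and twitter, one line per element; the 1% rate is an arbitrary choice to keep memory usage low):

set.seed(1234) ## reproducible sampling
rate <- 0.01
merged <- c(sample(blogs, round(length(blogs) * rate)),
            sample(news, round(length(news) * rate)),
            sample(twitter, round(length(twitter) * rate)))
length(merged)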

3/ Distribution of unigrams: the 90/10 Pareto rule

First, in order to speed up the exploratory analysis, we extract a data sample (the first 100 blog lines).
Then we count word frequencies:

##data1  <- Corpus(VectorSource(mydata1))

## We extract sample data
##explo <- mydata1[1:100,1]
##explo <- Corpus(VectorSource(explo))

##UnigramTokenizer <-
  ##function(x)
    ##unlist(lapply(ngrams(words(x), 1), paste, collapse = " "), use.names = FALSE)

##tdm <- TermDocumentMatrix(explo, control = list(tokenize = UnigramTokenizer))

##temp <- as.matrix(tdm) ## as.matrix() returns the full matrix; inspect() only prints a preview in recent tm versions
##FreqMat_uni <- data.frame(ST = rownames(temp), Freq = rowSums(temp))
##row.names(FreqMat_uni) <- NULL
##dim(FreqMat_uni)
##head(FreqMat_uni)

##FreqMat_uni_ordered <- FreqMat_uni[order(FreqMat_uni$Freq, decreasing = TRUE),]
##head(FreqMat_uni_ordered)
##distrib <- FreqMat_uni_ordered$Freq[1:100] ## top 100 frequencies
load("distrib.Rdata")
plot(distrib, xlab="Words ranked by decreasing frequency",ylab="Frequency",main="Distribution of words", type="o")

load("FreqMat_uni_ordered.Rdata")
head(FreqMat_uni_ordered)
##        ST Freq
## 4525  the  778
## 335   and  463
## 4517 that  211
## 1866  for  151
## 5022 with  147
## 4909  was  125
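
The plot shows the classic long-tailed, Zipf-like shape: a handful of very frequent words accounts for most occurrences. A minimal sketch to quantify the 90/10 claim on the loaded table (assuming FreqMat_uni_ordered covers the whole sample):

## cumulative share of all word occurrences covered by the top-ranked words
cum_share <- cumsum(FreqMat_uni_ordered$Freq) / sum(FreqMat_uni_ordered$Freq)
n90 <- which(cum_share >= 0.9)[1] ## number of words needed for 90% coverage
n90 / nrow(FreqMat_uni_ordered) ## fraction of the vocabulary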

4/ Distribution of bigrams

We proceed in the same way with bigrams:

##BigramTokenizer <-
  ##function(x)
    ##unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)

##tdm <- TermDocumentMatrix(explo, control = list(tokenize = BigramTokenizer))

##temp <- as.matrix(tdm)
##FreqMat_bi <- data.frame(ST = rownames(temp), Freq = rowSums(temp))
##row.names(FreqMat_bi) <- NULL
##dim(FreqMat_bi)
##head(FreqMat_bi)

##FreqMat_bi_ordered <- FreqMat_bi[order(FreqMat_bi$Freq, decreasing = TRUE),]
##head(FreqMat_bi_ordered)
##distrib2 <- FreqMat_bi_ordered$Freq[1:100] ## top 100 frequencies
load("distrib2.Rdata")
plot(distrib2, xlab="Bigrams ranked by decreasing frequency",ylab="Frequency",main="Distribution of bigrams", type="o")

load("FreqMat_bi_ordered.Rdata")
head(FreqMat_bi_ordered)
##           ST Freq
## 5617  in the   63
## 7755  of the   61
## 11677 to the   41
## 7893  on the   34
## 1189   and i   33
## 11457  to be   28

5/ Distribution of trigrams

We proceed in the same way with trigrams:

##TrigramTokenizer <-
  ##function(x)
    ##unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)

##tdm <- TermDocumentMatrix(explo, control = list(tokenize = TrigramTokenizer))

##temp <- as.matrix(tdm)
##FreqMat_tri <- data.frame(ST = rownames(temp), Freq = rowSums(temp))
##row.names(FreqMat_tri) <- NULL
##dim(FreqMat_tri)
##head(FreqMat_tri)

##FreqMat_tri_ordered <- FreqMat_tri[order(FreqMat_tri$Freq, decreasing = TRUE),]
##head(FreqMat_tri_ordered)
##distrib3 <- FreqMat_tri_ordered$Freq[1:100] ## top 100 frequencies
load("distrib3.Rdata")
plot(distrib3, xlab="Trigrams ranked by decreasing frequency",ylab="Frequency",main="Distribution of trigrams", type="o")

load("FreqMat_tri_ordered.Rdata")
head(FreqMat_tri_ordered)
##                ST Freq
## 422      a lot of   10
## 6101     i had to    7
## 6026 i decided to    6
## 9422   one of the    6
## 1891   as much as    5
## 6091      i had a    5

6/ Thoughts about our target model

To summarize, we have computed:
- the frequency distribution of unigrams;
- the frequency distribution of bigrams;
- the frequency distribution of trigrams.
From these frequencies we can derive probabilities, e.g. P(w3 | w1 w2) = Count(w1 w2 w3) / Count(w1 w2):
- probabilities of unigrams;
- probabilities of bigrams;
- probabilities of trigrams.
From these probabilities we can predict the next word:
- by choosing the trigram with the highest probability (maximum likelihood);
- by also taking the bigram probability into account, i.e. backing off to bigrams when no matching trigram has been observed (see the sketch below).
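
A minimal sketch of such a backoff predictor, built on the frequency tables loaded above (predict_next is a hypothetical helper; a real model would smooth the raw counts, e.g. with Katz backoff or Kneser-Ney):

predict_next <- function(w1, w2) {
  ## most frequent trigram beginning with "w1 w2 " (tables are already
  ## sorted by decreasing Freq, so the first match is the most frequent)
  hits <- subset(FreqMat_tri_ordered,
                 startsWith(as.character(ST), paste(w1, w2, "")))
  if (nrow(hits) == 0)
    ## back off: most frequent bigram beginning with "w2 "
    hits <- subset(FreqMat_bi_ordered,
                   startsWith(as.character(ST), paste0(w2, " ")))
  if (nrow(hits) == 0) return(NA_character_)
  ## return the last word of the best matching n-gram
  tail(strsplit(as.character(hits$ST[1]), " ")[[1]], 1)
}

predict_next("one", "of") ## "the", given the trigram counts above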