Our final purpose is to build a predictive model for text.
The first step is to understand the distribution of, and the relationships between, the words, tokens and phrases in the text.
NB: because of a knitr memory problem, not all of the data have been processed.
##install.packages("stringr")
##library(stringr)
##install.packages("tm")
##library(tm)
##install.packages("RWeka")
##library(RWeka)
mydata1 = read.table("en_US.blogs.txt", sep="\t", header=TRUE)
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec =
## dec, : EOF within quoted string
dim(mydata1) ## 299 999 rows
## [1] 299999 1
str_length(mydata1) ## 2 293 365 characters
## [1] 2293365
##mydata2 = read.table("en_US.news.txt", sep="\t", header=TRUE)
##dim(mydata2) ## 25 353 rows
##str_length(mydata2) ## 166 696
##mydata3 = read.table("en_US.twitter.txt", sep="\t", header=TRUE)
##dim(mydata3) ## 829 086 rows
##str_length(mydata3) ## 6 532868
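The warning above is caused by unmatched quote characters in the raw files. A possible workaround (not run here, shown only as a sketch) is to read the files line by line with readLines:
##blogs <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
##length(blogs)      ## number of lines
##sum(nchar(blogs))  ## total number of characters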
We have two options:
- merge the 3 sources => pros: more text volume, hence more accuracy in the final models;
- analyze each source separately => pros: we get a model for each context. We should keep the final purpose in mind (if it is predicting the next SMS word, the "blogs" data will be more useful than the "news" data...).
First, in order to speed up the exploratory analysis, we extract a data sample.
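Note that a random sample would arguably be more representative than simply taking the first lines, as done in the extraction below; a possible (not run) variant, with explo_random as a purely illustrative name:
##set.seed(123)
##explo_random <- mydata1[sample(nrow(mydata1), 100), 1]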
Then we count word frequencies:
##data1 <- Corpus(VectorSource(mydata1))
## We extract sample data
##explo <- mydata1[1:100,1]
##explo <- Corpus(VectorSource(explo))
##UnigramTokenizer <-
##function(x)
##unlist(lapply(ngrams(words(x), 1), paste, collapse = " "), use.names = FALSE)
##tdm <- TermDocumentMatrix(explo, control = list(tokenize = UnigramTokenizer))
##temp <- as.matrix(tdm)
##FreqMat_uni <- data.frame(ST = rownames(temp), Freq = rowSums(temp))
##row.names(FreqMat_uni) <- NULL
##dim(FreqMat_uni)
##head(FreqMat_uni)
##FreqMat_uni_ordered <- FreqMat_uni[order(FreqMat_uni$Freq,decreasing = TRUE),]
##head(FreqMat_uni_ordered)
##distrib <- FreqMat_uni_ordered$Freq[1:100]
load("distrib.Rdata")
plot(distrib, xlab="Words ranked by decreasing frequency",ylab="Frequency",main="Distribution of words", type="o")
load("FreqMat_uni_ordered.Rdata")
head(FreqMat_uni_ordered)
## ST Freq
## 4525 the 778
## 335 and 463
## 4517 that 211
## 1866 for 151
## 5022 with 147
## 4909 was 125
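Since the same steps are repeated below for bigrams and trigrams, they could be wrapped in a small helper function. This is only a sketch (not run, for the same memory reasons), assuming the explo corpus built above; ngram_freq is a hypothetical name:
##ngram_freq <- function(corpus, n) {
##  tokenizer <- function(x)
##    unlist(lapply(ngrams(words(x), n), paste, collapse = " "), use.names = FALSE)
##  tdm <- TermDocumentMatrix(corpus, control = list(tokenize = tokenizer))
##  m <- as.matrix(tdm)
##  freq <- data.frame(ST = rownames(m), Freq = rowSums(m), row.names = NULL)
##  freq[order(freq$Freq, decreasing = TRUE), ]
##}
##head(ngram_freq(explo, 2)) ## would reproduce the bigram table below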
We proceed in the same way with bigrams.
##BigramTokenizer <-
##function(x)
##unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
##tdm <- TermDocumentMatrix(explo, control = list(tokenize = BigramTokenizer))
##temp <- as.matrix(tdm)
##FreqMat_bi <- data.frame(ST = rownames(temp), Freq = rowSums(temp))
##row.names(FreqMat_bi) <- NULL
##dim(FreqMat_bi)
##head(FreqMat_bi)
##FreqMat_bi_ordered <- FreqMat_bi[order(FreqMat_bi$Freq,decreasing = TRUE),]
##head(FreqMat_bi_ordered)
##distrib2 <- FreqMat_bi_ordered$Freq[1:100]
load("distrib2.Rdata")
plot(distrib2, xlab="Bigrams ranked by decreasing frequency",ylab="Frequency",main="Distribution of bigrams", type="o")
load("FreqMat_bi_ordered.Rdata")
head(FreqMat_bi_ordered)
## ST Freq
## 5617 in the 63
## 7755 of the 61
## 11677 to the 41
## 7893 on the 34
## 1189 and i 33
## 11457 to be 28
We proceed in the same way with trigrams.
##TrigramTokenizer <-
##function(x)
##unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)
##tdm <- TermDocumentMatrix(explo, control = list(tokenize = TrigramTokenizer))
##temp <- as.matrix(tdm)
##FreqMat_tri <- data.frame(ST = rownames(temp), Freq = rowSums(temp))
##row.names(FreqMat_tri) <- NULL
##dim(FreqMat_tri)
##head(FreqMat_tri)
##FreqMat_tri_ordered <- FreqMat_tri[order(FreqMat_tri$Freq,decreasing = TRUE),]
##head(FreqMat_tri_ordered)
##distrib3 <- FreqMat_tri_ordered$Freq[1:100]
load("distrib3.Rdata")
plot(distrib3, xlab="trigrams ranked by decreasing frequency",ylab="Frequency",main="Distribution of trigrams", type="o")
load("FreqMat_tri_ordered.Rdata")
head(FreqMat_tri_ordered)
## ST Freq
## 422 a lot of 10
## 6101 i had to 7
## 6026 i decided to 6
## 9422 one of the 6
## 1891 as much as 5
## 6091 i had a 5
To summarize, we have computed:
- the frequency distribution of unigrams;
- the frequency distribution of bigrams;
- the frequency distribution of trigrams.
From these frequencies we can derive probabilities (as sketched below):
- probabilities of unigrams;
- probabilities of bigrams;
- probabilities of trigrams.
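For instance, the probability of a word given the two previous words can be estimated as Freq(w1 w2 w3) / Freq(w1 w2). A minimal sketch, assuming the frequency tables computed above (p_trigram is a hypothetical helper, not run):
##p_trigram <- function(w1, w2, w3) {
##  tri <- FreqMat_tri_ordered$Freq[FreqMat_tri_ordered$ST == paste(w1, w2, w3)]
##  bi  <- FreqMat_bi_ordered$Freq[FreqMat_bi_ordered$ST == paste(w1, w2)]
##  if (length(tri) == 0 || length(bi) == 0) return(0)
##  tri / bi
##}
##p_trigram("one", "of", "the") ## Freq("one of the") / Freq("one of")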
From these probabilities we can predict the next word:
- by choosing the trigram with the highest probability (maximum likelihood);
- by also taking the bigram probability into account (maximum likelihood), for example when no matching trigram is found. A minimal sketch of such a predictor follows.
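The sketch assumes the frequency tables computed above and uses a simple back-off from trigrams to bigrams as one possible way of combining the two probabilities (predict_next is a hypothetical helper, not run):
##predict_next <- function(w1, w2) {
##  ## candidate trigrams starting with "w1 w2"
##  cand <- FreqMat_tri_ordered[grepl(paste0("^", w1, " ", w2, " "), FreqMat_tri_ordered$ST), ]
##  if (nrow(cand) == 0) {
##    ## back off to bigrams starting with "w2"
##    cand <- FreqMat_bi_ordered[grepl(paste0("^", w2, " "), FreqMat_bi_ordered$ST), ]
##  }
##  if (nrow(cand) == 0) return(NA)
##  ## the tables are already ordered by decreasing frequency, so the first row
##  ## is the maximum likelihood choice; its last word is the prediction
##  tail(strsplit(as.character(cand$ST[1]), " ")[[1]], 1)
##}
##predict_next("one", "of") ## should return "the" given the counts above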