Explore the Whole Data and Sample
As shown in the table below, the full data set is large and processing all of it in R is time-consuming. Because of this, I sample 1% of the lines from each file for the exploratory analyses. The same table also summarizes the sample in terms of the number of lines and words.
# load the full data, record line/word counts, and write out a 1% sample
library(knitr)

mydir <- "./en_US"
sample_dir <- "./en_US_sample"
sample_ratio <- 0.01
if (!dir.exists(sample_dir)) dir.create(sample_dir)

fnv <- vector()  # file names
ln  <- vector()  # number of lines per file
wn  <- vector()  # number of words per file
sln <- vector()  # number of sampled lines per file
swn <- vector()  # number of sampled words per file

set.seed(321)
for (fn in list.files(mydir)) {
    fnv <- c(fnv, fn)
    fcontents <- readLines(paste0(mydir, "/", fn), encoding = "UTF-8")
    nlines <- length(fcontents)
    ln <- c(ln, nlines)
    # count words by splitting each line on whitespace
    nwords <- length(unlist(strsplit(fcontents, split = "\\s+"), use.names = FALSE))
    wn <- c(wn, nwords)
    # flip a coin with probability sample_ratio for each line
    sample_01 <- rbinom(nlines, 1, prob = sample_ratio)
    sample_index <- which(as.logical(sample_01))
    sln <- c(sln, sum(sample_01))
    swn <- c(swn, length(unlist(strsplit(fcontents[sample_index], split = "\\s+"), use.names = FALSE)))
    writeLines(fcontents[sample_index], con = paste0(sample_dir, "/", fn))
}
# make a table of the summary
kable(data.frame(FileName = fnv, NumOfLines = ln, NumOfWords = wn, SampleLines = sln, SampleWords = swn))
| FileName | NumOfLines | NumOfWords | SampleLines | SampleWords |
|---|---|---|---|---|
| en_US.blogs.txt | 899288 | 37334149 | 9095 | 376697 |
| en_US.news.txt | 1010242 | 34372814 | 10045 | 342599 |
| en_US.twitter.txt | 2360148 | 30373565 | 23953 | 309157 |
Exploratory Analyses
Next, we explore the sampled data to gain more insights about the text files.
# load the sampled 1% of the data; scan() splits each file into a vector of words
sample_dir <- "./en_US_sample"
text <- vector()
for (fn in list.files(sample_dir)) {
    text <- c(text, scan(file = paste0(sample_dir, "/", fn), what = "c"))
}
# length(text)
# change all text to lower case
text <- tolower(text)
# build bigrams and trigrams by pasting each word with the word(s) that follow it;
# a "." pads the shifted vectors at the end
unigram <- text
text2 <- c(text[-1], ".")   # words shifted by one position
bigram <- paste(text, text2)
text3 <- c(text2[-1], ".")  # words shifted by two positions
trigram <- paste(text, text2, text3)
# Frequency count
freq1 <- sort(table(unigram), decreasing = T)
unigram.dist <- data.frame(
words = names(freq1),
freq = as.numeric(freq1),
ratio = as.numeric(freq1)/sum(freq1),
cumsum_ratio = cumsum(as.numeric(freq1)/sum(freq1)))
# 50% or 90% coverage
# number of unique words
nrow(unigram.dist)
## [1] 103306
# number of words needed to cover 50% of the document
unigram.dist[min(which(unigram.dist$cumsum_ratio > 0.5)),]
## words freq ratio cumsum_ratio
## 219 against 385 0.0004117246 0.5002422
# number of words needed to cover 90% of the document
unigram.dist[min(which(unigram.dist$cumsum_ratio > 0.9)),]
## words freq ratio cumsum_ratio
## 22919 throws, 3 3.208244e-06 0.9000001
# table and barplot of the unigram
kable(unigram.dist[1:10,])
| words | freq | ratio | cumsum_ratio |
|---|---|---|---|
| the | 43177 | 0.0461741 | 0.0461741 |
| to | 24558 | 0.0262627 | 0.0724368 |
| and | 21427 | 0.0229143 | 0.0953511 |
| a | 21401 | 0.0228865 | 0.1182377 |
| of | 18006 | 0.0192559 | 0.1374936 |
| in | 14801 | 0.0158284 | 0.1533220 |
| i | 14463 | 0.0154669 | 0.1687889 |
| for | 9945 | 0.0106353 | 0.1794242 |
| is | 9394 | 0.0100461 | 0.1894703 |
| that | 8852 | 0.0094665 | 0.1989368 |
par(las=2) # make label text perpendicular to axis
par(mar=c(5,8,1,1))
barplot(freq1[1:10], horiz=TRUE, main="Top 10 Unigrams")

freq2 <- sort(table(bigram), decreasing = T)
# head(freq2)
bigram.dist <- data.frame(
words = names(freq2),
freq = as.numeric(freq2),
ratio = as.numeric(freq2)/sum(freq2),
cumsum_ratio = cumsum(as.numeric(freq2)/sum(freq2)))
kable(bigram.dist[1:10,])
| words | freq | ratio | cumsum_ratio |
|---|---|---|---|
| of the | 3822 | 0.0040873 | 0.0040873 |
| in the | 3720 | 0.0039782 | 0.0080655 |
| to the | 1929 | 0.0020629 | 0.0101284 |
| for the | 1855 | 0.0019838 | 0.0121122 |
| on the | 1722 | 0.0018415 | 0.0139537 |
| at the | 1407 | 0.0015047 | 0.0154584 |
| to be | 1406 | 0.0015036 | 0.0169620 |
| and the | 1095 | 0.0011710 | 0.0181330 |
| in a | 1030 | 0.0011015 | 0.0192345 |
| with the | 961 | 0.0010277 | 0.0202622 |
barplot(freq2[1:10], horiz=TRUE, main="Top 10 Bigrams")

freq3 <- sort(table(trigram), decreasing = T)
# head(freq3)
trigram.dist <- data.frame(
words = names(freq3),
freq = as.numeric(freq3),
ratio = as.numeric(freq3)/sum(freq3),
cumsum_ratio = cumsum(as.numeric(freq3)/sum(freq3)))
kable(trigram.dist[1:10,])
| words | freq | ratio | cumsum_ratio |
|---|---|---|---|
| one of the | 308 | 0.0003294 | 0.0003294 |
| thanks for the | 230 | 0.0002460 | 0.0005753 |
| a lot of | 217 | 0.0002321 | 0.0008074 |
| to be a | 174 | 0.0001861 | 0.0009935 |
| the end of | 147 | 0.0001572 | 0.0011507 |
| going to be | 144 | 0.0001540 | 0.0013047 |
| as well as | 142 | 0.0001519 | 0.0014565 |
| out of the | 142 | 0.0001519 | 0.0016084 |
| i want to | 128 | 0.0001369 | 0.0017453 |
| it was a | 127 | 0.0001358 | 0.0018811 |
barplot(freq3[1:10], horiz=TRUE, main="Top 10 Trigrams")

Interesting Findings
- We need only 0.2% (219/103306) of the unique words to cover 50% of the document, but 22.2% (22919/103306) of the unique words to cover 90% of it. Clearly, a small set of words occurs far more frequently than the rest.
- The most frequent words in the sample are mostly stop words (e.g., the, a). Further preprocessing is needed to remove stop words and extract more meaningful words. For the moment, the only preprocessing step is converting the text to lower case with the tolower function.
- The 1-gram, 2-gram, and 3-gram models can be built from the unigram, bigram, and trigram frequency counts, following the references. So far, I have tabulated the counts and probabilities for the 1-grams, 2-grams, and 3-grams; the next step is to compute conditional probabilities such as P(word | previous word) from these counts (a minimal sketch is shown after this list).
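To make the last point concrete, here is a minimal sketch (not the final model) of estimating the conditional probability of a word given the previous word from the freq1 and freq2 count tables computed above; the helper name cond_prob_bigram is hypothetical and only for illustration.

# Estimate P(w2 | w1) as count(w1 w2) / count(w1), using the freq1 (unigram)
# and freq2 (bigram) count tables from the exploration above.
# The function name cond_prob_bigram is hypothetical.
cond_prob_bigram <- function(w1, w2, unigram_freq, bigram_freq) {
    c1  <- unigram_freq[w1]            # count of the first word
    c12 <- bigram_freq[paste(w1, w2)]  # count of the bigram "w1 w2"
    if (is.na(c1) || c1 == 0) return(NA_real_)  # unseen history
    if (is.na(c12)) c12 <- 0                    # unseen bigram
    as.numeric(c12) / as.numeric(c1)
}

# example: the estimated probability that "the" follows "of"
cond_prob_bigram("of", "the", freq1, freq2)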
Next Steps
- Remove stop words and stem the text, as discussed in the findings above.
- Implement a backoff model to deal with unseen n-grams; an easier but less accurate alternative is Laplace smoothing (a sketch is shown after this list).
- Explore the RWeka package, which can build and save n-gram models.
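To illustrate the Laplace smoothing mentioned in the second item above, the sketch below applies add-one smoothing to the bigram conditional probability, reusing the freq1 and freq2 tables from the exploration. The function name laplace_bigram_prob is hypothetical, and using the number of observed unigrams as the vocabulary size is an assumption made only for this sketch.

# Add-one (Laplace) smoothing: P(w2 | w1) = (count(w1 w2) + 1) / (count(w1) + V),
# where V is the vocabulary size, so unseen bigrams get a small non-zero probability.
# The function name and the choice of V are illustrative assumptions.
laplace_bigram_prob <- function(w1, w2, unigram_freq, bigram_freq) {
    V   <- length(unigram_freq)        # vocabulary size (here: observed unigrams)
    c1  <- unigram_freq[w1]
    c12 <- bigram_freq[paste(w1, w2)]
    if (is.na(c1))  c1  <- 0           # unseen history
    if (is.na(c12)) c12 <- 0           # unseen bigram
    (as.numeric(c12) + 1) / (as.numeric(c1) + V)
}

# an unseen bigram now gets a small non-zero probability instead of zero
laplace_bigram_prob("of", "zzzunseen", freq1, freq2)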