Explore the whole data and sample

As shown in the table below, the full data set is large, and processing all of it in R is time-consuming. I therefore sample 1% of the lines of each file for the exploratory analyses that follow. The numbers of lines and words in both the full data and the sample are reported in the same table.

# load the raw data and draw a 1% sample of each file
library(knitr)
mydir <- "./en_US"
sample_dir <- "./en_US_sample"
dir.create(sample_dir, showWarnings = FALSE)
sample_ratio <- 0.01
fnv <- vector()  # file names
ln <- vector()   # number of lines in each file
wn <- vector()   # number of words in each file
sln <- vector()  # number of lines in each sample
swn <- vector()  # number of words in each sample
set.seed(321)
for (fn in list.files(mydir)){
    # print(paste("Sampling ", fn))
    fnv <- c(fnv, fn)
    fcontents <- readLines(paste0(mydir, "/", fn), encoding = 'UTF-8')
    nlines <- length(fcontents)
    ln <- c(ln, nlines)
    # count whitespace-separated tokens (strsplit is vectorized over the lines)
    nwords <- length(unlist(strsplit(fcontents, "\\s+")))
    wn <- c(wn, nwords)
    # draw a Bernoulli(sample_ratio) indicator for each line
    sample_01 <- rbinom(nlines, 1, prob = sample_ratio)
    sample_index <- which(as.logical(sample_01))
    snlines <- sum(sample_01)
    # print(paste(snlines, "out of", nlines, "lines selected"))
    sln <- c(sln, snlines)
    swn <- c(swn, length(unlist(strsplit(fcontents[sample_index], "\\s+"))))
    writeLines(fcontents[sample_index], con=paste0(sample_dir, "/", fn))
}
# make a table of the summary
kable(data.frame(FileName = fnv, NumOfLines = ln, NumOfWords = wn, SampleLines = sln, SampleWords = swn))
FileName             NumOfLines   NumOfWords   SampleLines   SampleWords
------------------  -----------  -----------  ------------  ------------
en_US.blogs.txt          899288     37334149          9095        376697
en_US.news.txt          1010242     34372814         10045        342599
en_US.twitter.txt       2360148     30373565         23953        309157

Exploratory Analyses

Next, we explore the sampled data to gain more insights about the text files.

# load the sampled 1% of the whole data as a vector of word tokens
sample_dir <- "./en_US_sample"
text <- vector()
for (fn in list.files(sample_dir)){
    # scan() with what = "c" reads whitespace-separated tokens as character strings
    text <- c(text, scan(file = paste0(sample_dir, "/", fn), what = "c"))
}
# length(text)

# change all to lower case
text <- tolower(text) 

# paste shifted copies of the word vector to form 2-grams and 3-grams from the 1-grams
# (padding with "." keeps the shifted vectors the same length)
unigram <- text
text2 <- c(text[-1], ".")             # words shifted left by one position
bigram <- paste(text, text2)
text3 <- c(text2[-1], ".")            # words shifted left by two positions
trigram <- paste(text, text2, text3)

# Frequency count
freq1 <- sort(table(unigram), decreasing = T)
    
unigram.dist <- data.frame(
    words = names(freq1), 
    freq = as.numeric(freq1), 
    ratio = as.numeric(freq1)/sum(freq1),
    cumsum_ratio = cumsum(as.numeric(freq1)/sum(freq1)))

# 50% or 90% coverage
# number of unique words
nrow(unigram.dist)
## [1] 103306
# number of words needed to cover 50% of the document
unigram.dist[min(which(unigram.dist$cumsum_ratio > 0.5)),]
##       words freq        ratio cumsum_ratio
## 219 against  385 0.0004117246    0.5002422
# number of words needed to cover 90% of the document
unigram.dist[min(which(unigram.dist$cumsum_ratio > 0.9)),]
##         words freq        ratio cumsum_ratio
## 22919 throws,    3 3.208244e-06    0.9000001
# table and barplot of the unigram
kable(unigram.dist[1:10,])
words      freq       ratio   cumsum_ratio
------  -------  ----------  -------------
the       43177   0.0461741      0.0461741
to        24558   0.0262627      0.0724368
and       21427   0.0229143      0.0953511
a         21401   0.0228865      0.1182377
of        18006   0.0192559      0.1374936
in        14801   0.0158284      0.1533220
i         14463   0.0154669      0.1687889
for        9945   0.0106353      0.1794242
is         9394   0.0100461      0.1894703
that       8852   0.0094665      0.1989368
par(las=2) # make label text perpendicular to axis
par(mar=c(5,8,1,1))
barplot(freq1[1:10], horiz=TRUE, main="Top 10 Unigrams")

freq2 <- sort(table(bigram), decreasing = T)
# head(freq2)
bigram.dist <- data.frame(
    words = names(freq2), 
    freq = as.numeric(freq2), 
    ratio = as.numeric(freq2)/sum(freq2),
    cumsum_ratio = cumsum(as.numeric(freq2)/sum(freq2)))
kable(bigram.dist[1:10,])
words        freq       ratio   cumsum_ratio
---------  ------  ----------  -------------
of the       3822   0.0040873      0.0040873
in the       3720   0.0039782      0.0080655
to the       1929   0.0020629      0.0101284
for the      1855   0.0019838      0.0121122
on the       1722   0.0018415      0.0139537
at the       1407   0.0015047      0.0154584
to be        1406   0.0015036      0.0169620
and the      1095   0.0011710      0.0181330
in a         1030   0.0011015      0.0192345
with the      961   0.0010277      0.0202622
barplot(freq2[1:10], horiz=TRUE, main="Top 10 Bigrams")

freq3 <- sort(table(trigram), decreasing = T)
# head(freq3)
trigram.dist <- data.frame(
    words = names(freq3), 
    freq = as.numeric(freq3), 
    ratio = as.numeric(freq3)/sum(freq3),
    cumsum_ratio = cumsum(as.numeric(freq3)/sum(freq3)))
kable(trigram.dist[1:10,])
words              freq       ratio   cumsum_ratio
---------------  ------  ----------  -------------
one of the          308   0.0003294      0.0003294
thanks for the      230   0.0002460      0.0005753
a lot of            217   0.0002321      0.0008074
to be a             174   0.0001861      0.0009935
the end of          147   0.0001572      0.0011507
going to be         144   0.0001540      0.0013047
as well as          142   0.0001519      0.0014565
out of the          142   0.0001519      0.0016084
i want to           128   0.0001369      0.0017453
it was a            127   0.0001358      0.0018811
barplot(freq3[1:10], horiz=TRUE, main="Top 10 Trigrams")

Interesting Findings

  1. Only 0.2% (219/103306) of the unique words are needed to cover 50% of all word tokens in the document, and only 2.2% (22919/103306) are needed to cover 90%. Clearly, some words occur far more frequently than others.
  2. The most frequent words in the sample are mostly stop words (e.g., the, a). Further preprocessing is needed to remove the stop words and surface more meaningful words. For the moment, the only preprocessing step is converting the text to lower case with the tolower function.
  3. The n-gram language models can be built from the unigram, bigram, and trigram frequency counts, following the references. So far I have tabulated the counts and marginal probabilities for the 1-grams, 2-grams, and 3-grams; the next step is to estimate the conditional probabilities, e.g. P(w2 | w1) = count(w1 w2) / count(w1), as sketched below.
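
The sketch below is not part of the analysis above; it only illustrates how the conditional probability P(w2 | w1) could be estimated by maximum likelihood from the freq1 and freq2 tables that were already built (cond_prob is a hypothetical helper name).

# estimate P(w2 | w1) = count(w1 w2) / count(w1) from the existing count tables
cond_prob <- function(w1, w2, unigram_counts = freq1, bigram_counts = freq2) {
    bi  <- bigram_counts[paste(w1, w2)]
    uni <- unigram_counts[w1]
    # unseen bigram or unseen preceding word: return zero probability
    if (is.na(bi) || is.na(uni)) return(0)
    as.numeric(bi) / as.numeric(uni)
}
# example: the estimated probability that "the" follows "of"
cond_prob("of", "the")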

Next Steps

  1. Remove stop words and stem the document, as discussed in the Findings section (see the sketches after this list).
  2. Build a backoff model to deal with unseen n-grams; a simpler but less accurate alternative is Laplace smoothing (also sketched after this list).
  3. Explore the RWeka package, which can build and save the n-gram models (see the last sketch after this list).
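
A minimal sketch of steps 1 and 2, assuming the tm and SnowballC packages are available; the names cleaned and laplace_prob are hypothetical, and a real implementation would rebuild the count tables from the cleaned text before smoothing.

library(tm)         # stopwords()
library(SnowballC)  # wordStem()

# step 1: drop stop words from the word vector `text` and stem what remains
cleaned <- text[!(text %in% stopwords("en"))]
cleaned <- wordStem(cleaned, language = "english")

# step 2: Laplace (add-one) smoothing gives every unseen bigram a small
# non-zero probability: P(w2 | w1) = (count(w1 w2) + 1) / (count(w1) + V)
V <- length(freq1)  # vocabulary size
laplace_prob <- function(w1, w2) {
    bi  <- freq2[paste(w1, w2)]
    uni <- freq1[w1]
    (ifelse(is.na(bi), 0, as.numeric(bi)) + 1) /
        (ifelse(is.na(uni), 0, as.numeric(uni)) + V)
}
laplace_prob("of", "the")     # a frequent bigram
laplace_prob("of", "zzzz")    # an unseen bigram still gets a positive probability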
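
And a sketch of step 3, assuming RWeka (and its Java dependency) is installed; the file path is just one of the sample files written earlier.

library(RWeka)

# tokenize the sampled news file directly into bigrams with RWeka
news_lines <- readLines("./en_US_sample/en_US.news.txt", encoding = "UTF-8")
rweka_bigrams <- NGramTokenizer(news_lines, Weka_control(min = 2, max = 2))
head(sort(table(rweka_bigrams), decreasing = TRUE), 10)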

References

https://english.boisestate.edu/johnfry/files/2013/04/bigram-2x2.pdf