As noted earlier, a corpus is a body of text from which we build and test LMs.

# Start from a clean slate: remove every non-hidden object from the current
# environment before the analysis runs (ls() omits names starting with ".").
# This does not detach packages or reset options.
# NOTE(review): this also wipes whatever the caller already had in their
# workspace; running the script in a fresh R session is the safer way to get
# the same isolation -- consider dropping this line.
rm(list = ls())
library(quanteda)
## Package version: 2.1.2
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
## 
##     View
library(data.table)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)

First, we read in the text files…

# Read every line of a text file, marking the strings as UTF-8 and skipping
# embedded NUL bytes.
#
# The file is read through an explicit binary-mode connection rather than by
# path. When readLines() is given a path it opens the file in text mode, and
# on Windows a text-mode read treats an embedded SUB (0x1A) byte as
# end-of-file, which can silently truncate the result. Binary mode reads the
# whole file on every platform; readLines() still accepts LF, CRLF and CR as
# line terminators.
#
# path: path to the text file.
# Returns a character vector with one element per line.
read_corpus_lines <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)  # always release the connection
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- "final/en_US/en_US.blogs.txt"
blog_line <- read_corpus_lines(blogs)

news <- "final/en_US/en_US.news.txt"
news_line <- read_corpus_lines(news)

twitter <- "final/en_US/en_US.twitter.txt"
twitter_line <- read_corpus_lines(twitter)

Create a sample dataset using 40% of the lines from each of the .txt files.

# Fixed seed so the same lines are drawn on every run.
set.seed(42)

# Share of each source's lines that goes into the sample.
sample_fraction <- 0.4

# Draw a random subset of `lines` without replacement.
#
# lines:    character vector, one element per line of text.
# fraction: share of the lines to keep, between 0 and 1.
# Returns floor(length(lines) * fraction) elements of `lines` in random order.
# sample.int() is used instead of sample(1:length(lines), ...) so the index
# range is explicit, and the size is floored explicitly instead of relying on
# sample() truncating a non-integer size.
sample_lines <- function(lines, fraction) {
  n_keep <- floor(length(lines) * fraction)
  lines[sample.int(length(lines), n_keep)]
}

# Order matters for reproducibility: blogs, then news, then twitter consume
# the random number stream in that sequence.
samplecorpus <- c(
  sample_lines(blog_line, sample_fraction),
  sample_lines(news_line, sample_fraction),
  sample_lines(twitter_line, sample_fraction)
)

Delete the blog_line, news_line and twitter_line objects to free up memory, since we don’t need them anymore.

# The full line vectors are no longer needed once the sample has been drawn.
rm(blog_line, news_line, twitter_line)

Step 1. i. Unigram, Bigram and Trigram counts

We will use the quanteda package to construct the n-gram tables. Many data scientists say it performs much faster than tm and RWeka for these types of tasks.

Create a corpus of words and then clean it using the quanteda package.

# Wrap the sampled lines in a quanteda corpus (one document per line).
sample_corpus <- corpus(samplecorpus)

# Split each document into word tokens, dropping punctuation, symbols,
# numbers, URLs and separators while leaving hyphenated words intact, then
# lower-case every token.
sample_corpus_tokens <- sample_corpus %>%
  tokens(
    what = "word",
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_numbers = TRUE,
    remove_url = TRUE,
    remove_separators = TRUE,
    split_hyphens = FALSE,
    include_docvars = TRUE,
    padding = FALSE
  ) %>%
  tokens_tolower()

# Optional steps, currently disabled: stemming and English stop-word removal.
# sample_corpus_tokens <- tokens_wordstem(sample_corpus_tokens, language = quanteda_options("language_stemmer"))
# sample_corpus_tokens <- tokens_select(sample_corpus_tokens, pattern = stopwords("en"), selection = "remove")

Delete the samplecorpus and sample_corpus objects to free up memory, since we don’t need them anymore.

# Only the token object is used from here on; drop the raw sample and corpus.
rm(samplecorpus, sample_corpus)

Get frequencies of n-grams from the sample corpus…

# Reduce a textstat_frequency() result to a two-column data.table:
#   ngram - the feature (n-gram) text
#   freq  - how often it occurs in the sample
to_ngram_table <- function(freq_stats) {
  data.table(
    ngram = freq_stats[["feature"]],
    freq = freq_stats[["frequency"]]
  )
}

# Unigrams: document-feature matrix built straight from the tokens.
dfm_sample_corpus_tokens <- dfm(sample_corpus_tokens)
unigrams_freq <- textstat_frequency(dfm_sample_corpus_tokens)
unigrs <- to_ngram_table(unigrams_freq)
fwrite(unigrs, "unigrs.csv")

# Bigrams: pairs of adjacent tokens.
bigrams <- sample_corpus_tokens %>%
  tokens_ngrams(n = 2) %>%
  dfm()
bigrams_freq <- textstat_frequency(bigrams)
bigrs <- to_ngram_table(bigrams_freq)
fwrite(bigrs, "bigrs.csv")

# Trigrams: triples of adjacent tokens.
trigrams <- sample_corpus_tokens %>%
  tokens_ngrams(n = 3) %>%
  dfm()
trigrams_freq <- textstat_frequency(trigrams)
trigrs <- to_ngram_table(trigrams_freq)
fwrite(trigrs, "trigrs.csv")

# Preview the first rows of each table.
head(unigrs)
head(bigrs)
head(trigrs)
##    ngram    freq
## 1:   the 1906603
## 2:    to 1101253
## 3:   and  965424
## 4:     a  952977
## 5:    of  800580
## 6:     i  662225
##      ngram   freq
## 1:  of_the 172002
## 2:  in_the 164632
## 3:  to_the  85441
## 4: for_the  80295
## 5:  on_the  78740
## 6:   to_be  64856
##             ngram  freq
## 1:     one_of_the 13776
## 2:       a_lot_of 11981
## 3: thanks_for_the  9567
## 4:        to_be_a  7376
## 5:    going_to_be  7033
## 6:     out_of_the  6033

Delete dfm_sample_corpus_tokens, sample_corpus_tokens, and the intermediate n-gram frequency objects to free up memory, since we don’t need them anymore.

# Drop the tokens, the document-feature matrices and the raw frequency
# tables; the unigrs, bigrs and trigrs tables are kept.
rm(
  dfm_sample_corpus_tokens, sample_corpus_tokens,
  unigrams_freq, bigrams_freq, trigrams_freq,
  bigrams, trigrams
)