As noted earlier, a corpus is a body of text from which we build and test language models (LMs).
rm(list = ls())
library(quanteda)
## Package version: 2.1.2
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
library(data.table)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
blogs <- "final/en_US/en_US.blogs.txt"
blog_line <- readLines(blogs,encoding="UTF-8", skipNul = TRUE)
news <- "final/en_US/en_US.news.txt"
news_line <- readLines(news,encoding="UTF-8", skipNul = TRUE)
twitter <- "final/en_US/en_US.twitter.txt"
twitter_line <- readLines(twitter,encoding="UTF-8", skipNul = TRUE)
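A quick sanity check on what was read: the number of lines each source contributed.
c(blogs = length(blog_line), news = length(news_line), twitter = length(twitter_line))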
Working with the full .txt files is expensive, so we sample 40% of the lines from each source; setting a seed first makes the sample reproducible.
set.seed(42)
samplecorpus <- c(blog_line[sample(1:length(blog_line), length(blog_line) * 0.4)],
                  news_line[sample(1:length(news_line), length(news_line) * 0.4)],
                  twitter_line[sample(1:length(twitter_line), length(twitter_line) * 0.4)])
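The raw line vectors are large. A quick check with base R's object.size() shows how much memory each occupies, which motivates the cleanup that follows:
sapply(list(blogs = blog_line, news = news_line, twitter = twitter_line),
       function(x) format(object.size(x), units = "MB"))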
We then remove the blog_line, news_line and twitter_line objects to free up memory, since we don't need them anymore.
rm(list = c("blog_line", "news_line", "twitter_line"))
We will use the quanteda package to construct the n-gram tables; it is widely reported to be considerably faster than tm and RWeka for this kind of tokenization and n-gram work.
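Readers who want to verify that claim on their own hardware can time the tokenizers on a slice of the sample. This is a rough sketch, not a formal benchmark: it assumes the tm package is installed, timing_lines is a throwaway name, and RWeka is omitted because it requires a Java runtime.
timing_lines <- samplecorpus[1:10000]
system.time(tokens(timing_lines, what = "word"))  # quanteda
system.time(tm::Boost_tokenizer(timing_lines))    # tm, for comparison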
First we convert the sampled lines into a corpus object with the quanteda package.
sample_corpus <- corpus(samplecorpus)
sample_corpus_tokens <- tokens(sample_corpus, what = "word",
remove_punct = TRUE,
remove_symbols = TRUE,
remove_numbers = TRUE,
remove_url = TRUE,
remove_separators = TRUE,
split_hyphens = FALSE,
include_docvars = TRUE,
padding = FALSE)
sample_corpus_tokens <- tokens_tolower(sample_corpus_tokens)
# Stemming and stopword removal are left disabled: a next-word predictor
# needs surface forms and function words (like "the" and "of") intact.
# sample_corpus_tokens <- tokens_wordstem(sample_corpus_tokens, language = quanteda_options("language_stemmer"))
# sample_corpus_tokens <- tokens_select(sample_corpus_tokens, pattern = stopwords("en"), selection = "remove")
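As a quick sanity check, the same cleaning options can be applied to a toy string (toy is a throwaway name used only for this illustration):
toy <- tokens("Visit https://example.com! It's FAST: 2 co-ops :)",
              what = "word", remove_punct = TRUE, remove_symbols = TRUE,
              remove_numbers = TRUE, remove_url = TRUE)
tokens_tolower(toy)
# The URL, number, punctuation and symbols should drop out, while "co-ops"
# survives intact because split_hyphens = FALSE.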
We also remove the samplecorpus and sample_corpus objects to free up memory, since we don't need them anymore.
rm(list = c("samplecorpus", "sample_corpus"))
dfm_sample_corpus_tokens <- dfm(sample_corpus_tokens)
unigrams_freq <- textstat_frequency(dfm_sample_corpus_tokens) # unigram frequency
unigrs <- subset(unigrams_freq, select = c(feature, frequency))
names(unigrs) <- c("ngram", "freq")
unigrs <- as.data.table(unigrs)
fwrite(unigrs, "unigrs.csv")
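The saved table can be reloaded later with fread(). As a sketch of where this is heading, maximum-likelihood unigram probabilities are one data.table step away (unigr_probs is a hypothetical name; copy() keeps unigrs itself unchanged):
unigr_probs <- copy(unigrs)[, prob := freq / sum(freq)] # P(w) = count(w) / N
head(unigr_probs)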
bigrams <- dfm(tokens_ngrams(sample_corpus_tokens, n = 2))
bigrams_freq <- textstat_frequency(bigrams) # bigram frequency
bigrs <- subset(bigrams_freq, select = c(feature, frequency))
names(bigrs) <- c("ngram", "freq")
bigrs <- as.data.table(bigrs)
fwrite(bigrs, "bigrs.csv")
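The bigram table supports conditional estimates: dividing each bigram count by the unigram count of its first word gives the maximum-likelihood estimate of P(w2 | w1). A sketch, again on a copy (bigr_probs is a hypothetical name):
bigr_probs <- copy(bigrs)
bigr_probs[, c("w1", "w2") := tstrsplit(ngram, "_", fixed = TRUE)]
bigr_probs <- merge(bigr_probs, unigrs[, .(w1 = ngram, w1_freq = freq)], by = "w1")
bigr_probs[, prob := freq / w1_freq] # P(w2 | w1) = count(w1 w2) / count(w1)
head(bigr_probs[order(-freq)])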
trigrams <- dfm(tokens_ngrams(sample_corpus_tokens, n = 3))
trigrams_freq <- textstat_frequency(trigrams) # trigram frequency
trigrs <- subset(trigrams_freq, select = c(feature, frequency))
names(trigrs) <- c("ngram", "freq")
trigrs <- as.data.table(trigrs)
fwrite(trigrs, "trigrs.csv")
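For prediction, each trigram is more useful split into a two-word prefix and the word that follows it, since a table keyed on the prefix is exactly what a simple backoff predictor queries. A sketch (trigr_pred is a hypothetical name):
trigr_pred <- copy(trigrs)
trigr_pred[, c("w1", "w2", "w3") := tstrsplit(ngram, "_", fixed = TRUE)]
trigr_pred[, prefix := paste(w1, w2, sep = "_")]
head(trigr_pred[order(-freq), .(prefix, prediction = w3, freq)])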
head(unigrs); head(bigrs); head(trigrs)
## ngram freq
## 1: the 1906603
## 2: to 1101253
## 3: and 965424
## 4: a 952977
## 5: of 800580
## 6: i 662225
## ngram freq
## 1: of_the 172002
## 2: in_the 164632
## 3: to_the 85441
## 4: for_the 80295
## 5: on_the 78740
## 6: to_be 64856
## ngram freq
## 1: one_of_the 13776
## 2: a_lot_of 11981
## 3: thanks_for_the 9567
## 4: to_be_a 7376
## 5: going_to_be 7033
## 6: out_of_the 6033
Finally, we remove dfm_sample_corpus_tokens, sample_corpus_tokens and the intermediate n-gram objects to free up memory, since we don't need them anymore.
rm(list = c("dfm_sample_corpus_tokens", "sample_corpus_tokens", "unigrams_freq", "bigrams_freq", "trigrams_freq", "bigrams", "trigrams"))
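A final gc() call confirms the removals took effect: it triggers a garbage collection and reports how much memory R is currently using.
gc()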