Capstone Project. N-Gram Generator

Load packages

# Start from an empty workspace: this removes every non-hidden object in the
# current environment. It does not detach packages or reset options.
rm(list = ls())
# quanteda supplies corpus(), tokens(), dfm() and the tokens_*() helpers.
library(quanteda)
# data.table supplies as.data.table() and fwrite().
library(data.table)
library(dplyr)
library(stringr)

Load the text files

# getwd()
# Every relative path used from here on resolves against this directory.
setwd("C:/Users/pruebas/Documents/Capstone_Katz_Back_Off")

# Read one corpus file into a character vector, one element per line.
# Strings are marked as UTF-8 and embedded NUL bytes are skipped.
# NOTE(review): on Windows a text-mode read may stop early at an embedded
# Ctrl-Z (0x1A) byte -- verify the resulting lengths against the real line
# counts of the files.
read_corpus_lines <- function(path) {
  readLines(path, encoding = "UTF-8", skipNul = TRUE)
}

blog_line <- read_corpus_lines("final/en_US/en_US.blogs.txt")
news_line <- read_corpus_lines("final/en_US/en_US.news.txt")
twitter_line <- read_corpus_lines("final/en_US/en_US.twitter.txt")

Create a sample dataset using 5% of each of the .txt files

# Fix the RNG seed so the same sample is drawn on every run.
set.seed(1977)

# Share of each source file that goes into the working sample.
sample_fraction <- 0.05

# Draw a random subset of `lines` without replacement.
#   lines:    character vector, one element per line of text
#   fraction: share of lines to keep; the count is floor(length * fraction)
# Returns the selected lines in the order they were drawn.
sample_lines <- function(lines, fraction) {
  n_keep <- floor(length(lines) * fraction)
  lines[sample.int(length(lines), n_keep)]
}

# Order matters: blogs, news and twitter are sampled one after another from
# the same RNG stream, so reordering these calls would change the sample.
samplecorpus <- c(
  sample_lines(blog_line, sample_fraction),
  sample_lines(news_line, sample_fraction),
  sample_lines(twitter_line, sample_fraction)
)

Delete the blog_line, news_line and twitter_line objects to free up memory, since we don’t need them anymore.

# The full line vectors are not used again after sampling.
rm(blog_line, news_line, twitter_line)

Unigram, Bigram and Trigram counts

We will use the quanteda package to construct the n-gram tables.

# Wrap the sampled lines in a quanteda corpus.
sample_corpus <- corpus(samplecorpus)

# Tokenise into words, dropping punctuation, symbols, numbers, URLs and
# separators; then lower-case, stem, and remove English stop words.
# `%>%` is available because dplyr is attached at the top of the script.
# NOTE(review): stemming runs before stop-word removal, so stop words whose
# stem differs from the listed form (e.g. "because" -> "becaus") are likely
# not removed -- confirm whether this order is intended.
sample_corpus_tokens <- sample_corpus %>%
  tokens(
    what = "word",
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_numbers = TRUE,
    remove_url = TRUE,
    remove_separators = TRUE,
    split_hyphens = FALSE,
    include_docvars = TRUE,
    padding = FALSE
  ) %>%
  tokens_tolower() %>%
  tokens_wordstem(language = quanteda_options("language_stemmer")) %>%
  tokens_select(pattern = stopwords("en"), selection = "remove")

Delete the samplecorpus and sample_corpus objects to free up memory, since we don’t need them anymore.

# Only the token object is needed from here on.
rm(samplecorpus, sample_corpus)

Get frequencies of n-grams from the sample corpus

# Reduce a textstat_frequency() result to a two-column data.table:
#   ngram - the feature label (for n > 1 the tokens are joined by "_")
#   freq  - the frequency reported by textstat_frequency()
# NOTE(review): in quanteda >= 3.0 textstat_frequency() is provided by the
# separate quanteda.textstats package -- confirm the installed version.
as_ngram_table <- function(freq_stats) {
  ngram_table <- as.data.table(freq_stats[, c("feature", "frequency")])
  setnames(ngram_table, c("ngram", "freq"))
  ngram_table
}

# The intermediate objects (dfm and *_freq) are kept under their original
# names because they are removed by name later in the script.

# Unigrams
dfm_sample_corpus_tokens <- dfm(sample_corpus_tokens)
unigrams_freq <- textstat_frequency(dfm_sample_corpus_tokens)
unigrs <- as_ngram_table(unigrams_freq)
fwrite(unigrs, "unigrs.csv")

# Bigrams
bigrams <- dfm(tokens_ngrams(sample_corpus_tokens, n = 2))
bigrams_freq <- textstat_frequency(bigrams)
bigrs <- as_ngram_table(bigrams_freq)
fwrite(bigrs, "bigrs.csv")

# Trigrams
trigrams <- dfm(tokens_ngrams(sample_corpus_tokens, n = 3))
trigrams_freq <- textstat_frequency(trigrams)
trigrs <- as_ngram_table(trigrams_freq)
fwrite(trigrs, "trigrs.csv")

# Preview the first rows of each n-gram table.
head(unigrs)
head(bigrs)
head(trigrs)
##    ngram  freq
## 1:  just 12764
## 2:   get 12473
## 3:  like 12299
## 4:   one 11425
## 5:    go 10789
## 6:  time  9818
##           ngram freq
## 1:    right_now 1067
## 2:    look_like  932
## 3:    feel_like  776
## 4:   last_night  768
## 5: look_forward  732
## 6: thank_follow  600
##               ngram freq
## 1: happi_mother_day  194
## 2:      let_us_know  123
## 3:   happi_new_year  106
## 4: look_forward_see   81
## 5:      amp_amp_amp   74
## 6:    new_york_citi   62

Delete dfm_sample_corpus_tokens, sample_corpus_tokens, and the intermediate n-gram frequency objects to free up memory, since we don’t need them anymore. The unigrs, bigrs and trigrs tables and the CSV files written above are kept.

# Drop the tokens, the document-feature matrices and the raw frequency
# tables; the unigrs/bigrs/trigrs tables stay in the workspace.
rm(
  dfm_sample_corpus_tokens,
  sample_corpus_tokens,
  unigrams_freq,
  bigrams_freq,
  trigrams_freq,
  bigrams,
  trigrams
)