setwd("D:/OneDrive - FUNIDES/Data Science/Capstone/")
library(stringr)
library(tm)
library(quanteda)
quanteda_options(threads = 8)
library(dplyr)
library(optimx)
According to Wikipedia “a corpus (plural corpora) or text corpus is a large and structured set of texts (nowadays usually electronically stored and processed). In corpus linguistics, they are used to do statistical analysis and hypothesis testing, checking occurrences or validating linguistic rules within a specific language territory”.
In the present case the corpus consists of texts from blogs, news articles, and Twitter posts in US English, provided by SwiftKey in the context of the capstone project of the Data Science Specialization from Johns Hopkins University on Coursera.
nlp_cap <- VCorpus(DirSource("final/en_US/"), readerControl = list(reader = readPlain, language = "en_US", load = TRUE))
summary(nlp_cap)
Length Class Mode
en_US.blogs.txt 2 PlainTextDocument list
en_US.news.txt 2 PlainTextDocument list
en_US.twitter.txt 2 PlainTextDocument list
In the context of the algorithm used below it is necessary to partition the corpus into training, validation, and test sets. The partition is as follows: 90% of each document is used for the training set, 5% for the validation set, and 5% for the test set.
bases <- c("blogs", "news", "twitter")
for(i in 1:3){
# Line indices for each split; the validation and test splits start one line after
# the previous split ends so that the three sets do not overlap.
assign(paste("train", i, sep = ""), 1:floor(length(nlp_cap[[i]]$content)*0.9))
assign(paste("valid", i, sep = ""), (floor(length(nlp_cap[[i]]$content)*0.9) + 1):floor(length(nlp_cap[[i]]$content)*0.95))
assign(paste("test", i, sep = ""), (floor(length(nlp_cap[[i]]$content)*0.95) + 1):length(nlp_cap[[i]]$content))
}
h <- 1
for (j in bases) {
assign(paste("train", j, sep = "_"), nlp_cap[[h]]$content[get(paste("train", h, sep = ""))])
assign(paste("valid", j, sep = "_"), nlp_cap[[h]]$content[get(paste("valid", h, sep = ""))])
assign(paste("test", j, sep = "_"), nlp_cap[[h]]$content[get(paste("test", h, sep = ""))])
h <- h + 1
}
dir.create("train")
file.create("train/train_blogs.txt","train/train_news.txt","train/train_twitter.txt", overwrite = TRUE)
con1 <- file("train/train_blogs.txt", open = "wt")
train_blogs <- gsub("_", " ", train_blogs) # replace underscores with spaces; "_" is used later as the n-gram separator
write(train_blogs, con1)
close(con1)
con1 <- file("train/train_news.txt", open = "wt")
train_news <- gsub("_"," ",train_news)
write(train_news, con1)
close(con1)
con1 <- file("train/train_twitter.txt", open = "wt")
train_twitter <- gsub("_"," ",train_twitter)
write(train_twitter, con1)
close(con1)
dir.create("valid")
file.create("valid/valid_blogs.txt", "valid/valid_news.txt", "valid/valid_twitter.txt", overwrite = TRUE)
con1 <- file("valid/valid_blogs.txt", open = "wt")
valid_blogs <- gsub("_"," ",valid_blogs)
write(valid_blogs, con1)
close(con1)
con1 <- file("valid/valid_news.txt", open = "wt")
valid_news <- gsub("_"," ",valid_news)
write(valid_news, con1)
close(con1)
con1 <- file("valid/valid_twitter.txt", open = "wt")
valid_twitter <- gsub("_"," ",valid_twitter)
write(valid_twitter, con1)
close(con1)
dir.create("test")
file.create("test/test_blogs.txt", "test/test_news.txt", "test/test_twitter.txt", overwrite = TRUE)
con1 <- file("test/test_blogs.txt", open = "wt")
test_blogs <- gsub("_"," ",test_blogs)
write(test_blogs, con1)
close(con1)
con1 <- file("test/test_news.txt", open = "wt")
test_news <- gsub("_"," ",test_news)
write(test_news, con1)
close(con1)
con1 <- file("test/test_twitter.txt", open = "wt")
test_twitter <- gsub("_"," ",test_twitter)
write(test_twitter, con1)
close(con1)
The corpus must be reloaded with the proper language, alphabet, and character encoding; non-ASCII characters are dropped below.
rm(nlp_cap, con1, valid_blogs, valid_news, valid_twitter, test_blogs, test_news, test_twitter, train1, train2, train3, valid1, valid2, valid3, test1, test2, test3, train_blogs, train_news, train_twitter, bases, h, i, j)
nlp_cap_train <- VCorpus(DirSource("train/"), readerControl = list(reader = readPlain, language = "en_US", load = TRUE))
for(i in 1:3){
nlp_cap_train[[i]]$content <- iconv(nlp_cap_train[[i]]$content, "latin1", "ASCII",sub='')
}
Below are summary statistics of the training sample.
rm(i)
nlp_cap_train <- VCorpus(VectorSource(nlp_cap_train))
nlp_cap_train_quant <- corpus(nlp_cap_train)
rm(nlp_cap_train)
summary(nlp_cap_train_quant)
Corpus consisting of 3 documents:
Text Types Tokens Sentences datetimestamp id language
text1 460484 38158677 1868983 2018-10-08 23:59:56 1 en
text2 109343 2754162 129974 2018-10-08 23:59:56 2 en
text3 530068 32934054 2329808 2018-10-08 23:59:56 3 en
Source: Converted from tm Corpus 'nlp_cap_train'
Created: Mon Oct 08 18:00:08 2018
Notes:
According to Wikipedia, “an n-gram is a contiguous sequence of n items from a given sample of text or speech”. In this project the items are words.
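As a minimal illustration (a toy sentence, not part of the corpus), quanteda's tokens_ngrams() builds word n-grams by joining adjacent tokens with an underscore, which is why underscores were replaced with spaces when the splits were written out above.
toy <- tokens("the quick brown fox jumps") # toy sentence, not from the corpus
tokens_ngrams(toy, n = 2) # "the_quick" "quick_brown" "brown_fox" "fox_jumps"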
Converting the corpus into tokens, removing numbers, punctuation, Twitter symbols, URLs, and other symbols.
tok <- tokens(nlp_cap_train_quant, remove_numbers = TRUE, remove_punct = TRUE, remove_twitter = TRUE, remove_url = TRUE, remove_symbols = TRUE)
rm(nlp_cap_train_quant)
Table with the 10 most frequent unigrams in the training set.
gram_1 <- tokens_ngrams(tok, n = 1)
gram_1_dfm <- dfm(gram_1)
uni_top <- topfeatures(gram_1_dfm, 10)
uni_top
the to and a i of in you is
2651100 1729622 1436613 1416193 1355643 1163466 920852 767501 731563
for
697608
rm(gram_1)
Converting the 1-gram tokens into a data table for easier manipulation, keeping only unigrams that appear more than seven times.
gram_1_dt <- tbl_df(textstat_frequency(gram_1_dfm)) %>%
rename(last_term = feature) %>%
mutate(first_term = last_term) %>%
mutate(middle_term = last_term) %>%
select(first_term, middle_term, last_term, frequency) %>%
filter(frequency > 7)
rm(gram_1_dfm)
Table with the 10 most frequent bigrams in the training set.
gram_2 <- tokens_ngrams(tok, n = 2)
gram_2_dfm <- dfm(gram_2)
bi_top <- topfeatures(gram_2_dfm, 10)
bi_top
of_the in_the for_the to_the on_the to_be at_the i_have and_the
232335 222209 123952 122597 116608 106665 80420 72095 70189
i_was
68251
rm(gram_2)
Converting the 2-gram tokens into a data table for easier manipulation, keeping only bigrams that appear more than seven times.
gram_2_dt <- tbl_df(textstat_frequency(gram_2_dfm))
gram_2_dt <- mutate(gram_2_dt, first_term = gsub("_.*$", "", gram_2_dt$feature), last_term = gsub(".*_", "", gram_2_dt$feature)) %>%
mutate(middle_term_1 = first_term) %>%
mutate(middle_term_2 = last_term) %>%
select(first_term, middle_term_1, middle_term_2, last_term, frequency) %>%
filter(frequency > 7)
rm(gram_2_dfm)
Table with the 10 most frequent trigrams in the training set.
gram_3 <- tokens_ngrams(tok, n = 3)
gram_3_dfm <- dfm(gram_3)
tri_top <- topfeatures(gram_3_dfm, 10)
tri_top
thanks_for_the one_of_the a_lot_of
21411 18944 17427
i_want_to to_be_a going_to_be
11952 11849 11469
i_have_a looking_forward_to i_have_to
9838 9509 9376
it_was_a
9279
rm(tok, gram_3)
Converting the 3-gram tokens into a data table for easier manipulation, keeping only trigrams that appear more than seven times.
gram_3_dt <- tbl_df(textstat_frequency(gram_3_dfm))
gram_3_dt <- mutate(gram_3_dt, first_term_bi = gsub("_[^_]*$","",gram_3_dt$feature))
gram_3_dt <- mutate(gram_3_dt,last_term = gsub(".*_", "", gram_3_dt$feature)) %>%
mutate(first_term = gsub("_.*$", "", gram_3_dt$first_term_bi)) %>%
mutate(middle_term = gsub(".*_", "", gram_3_dt$first_term_bi)) %>%
select(first_term, middle_term, last_term, frequency) %>%
filter(frequency > 7)
rm(gram_3_dfm)
The method of linear interpolation for a trigram model consists of the following steps:
1. Compute the maximum likelihood (ML) probabilities for unigrams according to:
\[P(w_1) = \frac{count(w_1)}{\sum{count(w_1)}}\] 2. Compute the ML probabilities for bigrams according to:
\[P(w_2 | w_1) = \frac{count(w_2, w_1)}{count(w_1)}\] 3. Compute the ML probabilities for trigrams according to:
\[P(w_3 | w_2, w_1) = \frac{count(w_3, w_2, w_1)}{count(w_2, w_1)}\] 4. The data is very sparse, so to smooth the estimates compute the interpolated probability:
\[P_{interp}(w_3 | w_2, w_1) = \lambda_1P(w_3 | w_2, w_1) + \lambda_2P(w_3 | w_2) + \lambda_3P(w_3)\] with \[\lambda_1 > 0, \lambda_2 > 0, \lambda_3 > 0\]
and \[\lambda_1 + \lambda_2 + \lambda_3 = 1\]
5. Choose the lambdas that maximize the log-likelihood of the validation set:
\[L(\lambda_1, \lambda_2, \lambda_3) = \sum{count(w_1^*,w_2^*,w_3^*)\log\left(P_{interp}(w_3 | w_2, w_1)\right)}\]
where \[count(w_1^*,w_2^*,w_3^*)\] are the trigram counts on the validation set.
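As a toy numeric illustration (the probabilities and weights below are hypothetical, not estimates from this corpus), the interpolated probability is simply a weighted average of the three ML estimates:
p_tri <- 0.50 # hypothetical P(w3 | w2, w1)
p_bi <- 0.20 # hypothetical P(w3 | w2)
p_uni <- 0.01 # hypothetical P(w3)
lambda <- c(0.6, 0.3, 0.1) # positive weights that sum to 1
p_interp <- lambda[1]*p_tri + lambda[2]*p_bi + lambda[3]*p_uni
p_interp # 0.361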
Calculating counts and probabilities for unigrams, bigrams, and trigrams.
# Trigram: ML probability of the last term given the first two terms
gram_3_dt_prob <- group_by(gram_3_dt, .dots = c("first_term", "middle_term")) %>%
mutate(sum_freq = sum(frequency)) %>%
ungroup() %>%
mutate(final_prob_tri = frequency/sum_freq) %>%
arrange(desc(frequency)) %>%
select(first_term, middle_term, last_term, frequency, sum_freq, final_prob_tri)
# Bigram: ML probability of the last term given the preceding term
gram_2_dt_prob <- group_by(gram_2_dt, first_term) %>%
mutate(sum_freq = sum(frequency)) %>%
ungroup() %>%
mutate(final_prob_bi = frequency/sum_freq) %>%
arrange(desc(frequency)) %>%
rename(middle_term = middle_term_1) %>%
select(middle_term, last_term, frequency, sum_freq, final_prob_bi)
# Unigram: ML probability of each term
gram_1_dt_prob <- mutate(gram_1_dt, sum_freq = sum(frequency)) %>%
mutate(final_prob_uni = frequency/sum_freq) %>%
arrange(desc(frequency)) %>%
select(last_term, frequency, sum_freq, final_prob_uni)
# Joining the unigram, bigram, and trigram probability tables
rm(gram_1_dt, gram_2_dt, gram_3_dt)
data_training <- left_join(gram_1_dt_prob, gram_2_dt_prob, by = "last_term" ) %>%
mutate(final_prob_bi = if_else(is.na(final_prob_bi ), 0, final_prob_bi ))
data_training <- left_join(data_training, gram_3_dt_prob, by = c("middle_term","last_term")) %>%
mutate(final_prob_tri = if_else(is.na(final_prob_tri ), 0, final_prob_tri)) %>%
select(first_term, middle_term, last_term, final_prob_tri, final_prob_bi, final_prob_uni)
# Backfill the first and middle terms that are missing for n-grams with no higher-order match
data_training <- mutate(data_training, first_term = if_else(is.na(first_term), middle_term, first_term)) %>%
mutate(middle_term = if_else(is.na(middle_term), last_term, middle_term), first_term = if_else(is.na(first_term), middle_term, first_term))
# Cleaning up memory
rm(gram_1_dt_prob, gram_2_dt_prob, gram_3_dt_prob)
nlp_cap_valid <- VCorpus(DirSource("valid/"), readerControl = list(reader = readPlain, language = "en_US", load = TRUE))
for(i in 1:3){
nlp_cap_valid[[i]]$content <- iconv(nlp_cap_valid[[i]]$content, "latin1", "ASCII",sub='')
}
nlp_cap_valid_quant <- corpus(nlp_cap_valid)
rm(nlp_cap_valid)
summary(nlp_cap_valid_quant)
Corpus consisting of 3 documents:
Text Types Tokens Sentences datetimestamp
valid_blogs.txt 89378 2125212 103918 2018-10-09 00:16:37
valid_news.txt 21355 152322 7197 2018-10-09 00:16:37
valid_twitter.txt 91615 1832766 130000 2018-10-09 00:16:37
id language
valid_blogs.txt en_US
valid_news.txt en_US
valid_twitter.txt en_US
Source: Converted from tm Corpus 'nlp_cap_valid'
Created: Mon Oct 08 18:16:38 2018
Notes:
tok <- tokens(nlp_cap_valid_quant, remove_numbers = TRUE, remove_punct = TRUE, remove_twitter = TRUE, remove_url = TRUE, remove_symbols = TRUE)
rm(nlp_cap_valid_quant)
gram_3 <- tokens_ngrams(tok, n = 3)
gram_3_dfm <- dfm(gram_3)
gram_3_dt_val <- tbl_df(textstat_frequency(gram_3_dfm))
gram_3_dt_val <- mutate(gram_3_dt_val, first_term_bi = gsub("_[^_]*$","",gram_3_dt_val$feature))
gram_3_dt_val <- mutate(gram_3_dt_val,last_term = gsub(".*_", "", gram_3_dt_val$feature)) %>%
mutate(first_term = gsub("_.*$", "", gram_3_dt_val$first_term_bi)) %>%
mutate(middle_term = gsub(".*_", "", gram_3_dt_val$first_term_bi)) %>%
select(first_term, middle_term, last_term, frequency) %>%
filter(frequency > 7)
Table with the 10 most frequent trigrams in the validation set.
topfeatures(gram_3_dfm, 10)
thanks_for_the one_of_the a_lot_of
1171 1085 959
i_want_to to_be_a going_to_be
683 640 621
i_have_a looking_forward_to it_was_a
545 528 509
i_have_to
495
rm(gram_3_dfm, tok, gram_3, i)
data_inter <- inner_join(data_training, gram_3_dt_val, by = c("first_term", "middle_term", "last_term"))
# Log-likelihood of the validation-set trigram counts under the interpolated model, as a function of the lambdas
calc_interp_prob <- function(lambda) {
data <- mutate(data_inter, prob_inter = lambda[1]*final_prob_tri + lambda[2]*final_prob_bi + lambda[3]*final_prob_uni) %>%
mutate(log_prob_inter = log(prob_inter)) %>%
mutate(loss_inter = frequency*log_prob_inter) %>%
summarise(cost_inter = sum(loss_inter))
l <- as.numeric(data[[1]])
return(l)
}
params <- c(0.15, 0.1, 0.1) # starting values for lambda_1, lambda_2, lambda_3
# Constraints ui %*% lambda >= ci: each lambda >= 0, each lambda <= 1, and their sum <= 1
ui1 <- cbind(c(1,0,0,-1,0,0,-1), c(0,1,0,0,-1,0,-1), c(0,0,1,0,0,-1,-1))
ci1 <- rbind(0,0,0,-1,-1,-1,-1)
# Maximize (fnscale = -1) the validation log-likelihood subject to the constraints
opt <- constrOptim(params, calc_interp_prob, NULL, method = "Nelder-Mead", ui = ui1, ci = ci1, outer.iterations = 1000, outer.eps = 1e-12, control = list(fnscale = -1))
The values of the optimal lambdas are 0.6, 0.4, and 0.
data_training <- mutate(data_training, prob = opt$par[1]*final_prob_tri+opt$par[2]*final_prob_bi+opt$par[3]*final_prob_uni) %>%
select(first_term, middle_term, last_term, prob)
The perplexity of a language model is defined as:
\[2^{-l}\] where \[l = \frac{1}{N}\sum{\log_2\left(P_{interp}(w_3 | w_2, w_1)\right)}\] and \[N\] is the number of trigrams in the evaluation set. The interpretation is that under a uniform model over a vocabulary of size \[V\] every word has probability \[\frac{1}{V}\] and the perplexity equals \[V\]; if the model achieves, for example, a perplexity of 100, it effectively reduces the vocabulary to about 100 equally likely choices per word.
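As a minimal sketch of the computation (the probabilities below are hypothetical, not output of the model):
p <- c(0.20, 0.10, 0.05, 0.40) # probabilities a model assigns to the observed trigrams
l <- mean(log2(p)) # average log2 probability per trigram
2^(-l) # perplexity, approximately 7.07 effective choices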
nlp_cap_test <- VCorpus(DirSource("test/"), readerControl = list(reader = readPlain, language = "en_US", load = TRUE))
for(i in 1:3){
nlp_cap_test[[i]]$content <- iconv(nlp_cap_test[[i]]$content, "latin1", "ASCII",sub='')
}
nlp_cap_test_quant <- corpus(nlp_cap_test)
rm(nlp_cap_test)
summary(nlp_cap_test_quant)
Corpus consisting of 3 documents:
Text Types Tokens Sentences datetimestamp
test_blogs.txt 90482 2145590 105868 2018-10-09 00:17:29
test_news.txt 21521 151184 7107 2018-10-09 00:17:29
test_twitter.txt 91812 1831916 129618 2018-10-09 00:17:29
id language
test_blogs.txt en_US
test_news.txt en_US
test_twitter.txt en_US
Source: Converted from tm Corpus 'nlp_cap_test'
Created: Mon Oct 08 18:17:31 2018
Notes:
tok <- tokens(nlp_cap_test_quant, remove_numbers = TRUE, remove_punct = TRUE, remove_twitter = TRUE, remove_url = TRUE, remove_symbols = TRUE)
rm(nlp_cap_test_quant)
gram_3 <- tokens_ngrams(tok, n = 3)
gram_3_dfm <- dfm(gram_3)
gram_3_dt_test <- tbl_df(textstat_frequency(gram_3_dfm))
gram_3_dt_test <- mutate(gram_3_dt_test, first_term_bi = gsub("_[^_]*$","",gram_3_dt_test$feature))
gram_3_dt_test <- mutate(gram_3_dt_test,last_term = gsub(".*_", "", gram_3_dt_test$feature)) %>%
mutate(first_term = gsub("_.*$", "", gram_3_dt_test$first_term_bi)) %>%
mutate(middle_term = gsub(".*_", "", gram_3_dt_test$first_term_bi)) %>%
select(first_term, middle_term, last_term, frequency) %>%
filter(frequency > 7)
Table with the 10 most frequent trigrams in the test set.
topfeatures(gram_3_dfm, 10)
thanks_for_the one_of_the a_lot_of
1221 1071 981
to_be_a going_to_be i_want_to
732 641 640
it_was_a i_have_a the_end_of
542 539 524
looking_forward_to
524
rm(gram_3_dfm, tok, gram_3, i)
perplexity_val <- inner_join(data_training, gram_3_dt_val, by = c("first_term", "middle_term", "last_term")) %>%
mutate(log_prob = log2(prob)) %>%
summarise(sum_log_prob = sum(log_prob), N = n()) %>%
mutate(perplexity = 2^((-1/N)*sum_log_prob))
perplexity_test <- inner_join(data_training, gram_3_dt_test, by = c("first_term", "middle_term", "last_term")) %>%
mutate(log_prob = log2(prob)) %>%
summarise(sum_log_prob = sum(log_prob), N = n()) %>%
mutate(perplexity = 2^((-1/N)*sum_log_prob))
The perplexity of the model on the validation set is 17.3 and on the test set 17.4, which is lower than the perplexity Katz found with his model on 100 sentences using trigrams.
write.table(data_training, "trigraminterp/data_training.txt", sep="\t")
write.table(uni_top, "uni_top.txt", sep="\t")
write.table(bi_top, "bi_top.txt", sep="\t")
write.table(tri_top, "tri_top.txt", sep="\t")
Collins, M. (2013). Language Modeling [Course notes]. Retrieved from the Columbia University CS website: http://www.cs.columbia.edu/~mcollins/lm-spring2013.pdf