setwd("D:/OneDrive - FUNIDES/Data Science/Capstone/")
library(stringr)
library(tm)
library(quanteda)
quanteda_options(threads = 8)
library(dplyr)
library(optimx)
According to Wikipedia “a corpus (plural corpora) or text corpus is a large and structured set of texts (nowadays usually electronically stored and processed). In corpus linguistics, they are used to do statistical analysis and hypothesis testing, checking occurrences or validating linguistic rules within a specific language territory”.
In the present case the corpus consists of texts from blogs, news articles, and Twitter posts in US English, provided by SwiftKey in the context of the capstone project of the Data Science Specialization from Johns Hopkins University on Coursera.
nlp_cap <- VCorpus(DirSource("final/en_US/"), readerControl = list(reader = readPlain, language = "en_US", load = TRUE))
summary(nlp_cap)
Length Class Mode
en_US.blogs.txt 2 PlainTextDocument list
en_US.news.txt 2 PlainTextDocument list
en_US.twitter.txt 2 PlainTextDocument list
In the context of the algorithm used below it is necessary to partition the corpus into training, validation, and test sets. The partition is as follows: 90% of each document is used for the training set, 5% for the validation set, and 5% for the test set.
bases <- c("blogs", "news", "twitter")
for(i in 1:3){
# Line indices for each split; the validation and test splits start one line after
# the previous split ends so that the three sets do not overlap.
assign(paste("train", i, sep = ""), 1:floor(length(nlp_cap[[i]]$content)*0.9))
assign(paste("valid", i, sep = ""), (floor(length(nlp_cap[[i]]$content)*0.9) + 1):floor(length(nlp_cap[[i]]$content)*0.95))
assign(paste("test", i, sep = ""), (floor(length(nlp_cap[[i]]$content)*0.95) + 1):length(nlp_cap[[i]]$content))
}
h <- 1
for (j in bases) {
assign(paste("train", j, sep = "_"), nlp_cap[[h]]$content[get(paste("train", h, sep = ""))])
assign(paste("valid", j, sep = "_"), nlp_cap[[h]]$content[get(paste("valid", h, sep = ""))])
assign(paste("test", j, sep = "_"), nlp_cap[[h]]$content[get(paste("test", h, sep = ""))])
h <- h + 1
}
dir.create("train")
file.create("train/train_blogs.txt","train/train_news.txt","train/train_twitter.txt", overwrite = TRUE)
con1 <- file("train/train_blogs.txt", open = "wt")
train_blogs <- gsub("_", " ", train_blogs) # replace underscores with spaces; "_" is used later as the n-gram separator
write(train_blogs, con1)
close(con1)
con1 <- file("train/train_news.txt", open = "wt")
train_news <- gsub("_"," ",train_news)
write(train_news, con1)
close(con1)
con1 <- file("train/train_twitter.txt", open = "wt")
train_twitter <- gsub("_"," ",train_twitter)
write(train_twitter, con1)
close(con1)
dir.create("valid")
file.create("valid/valid_blogs.txt", "valid/valid_news.txt", "valid/valid_twitter.txt", overwrite = TRUE)
con1 <- file("valid/valid_blogs.txt", open = "wt")
valid_blogs <- gsub("_"," ",valid_blogs)
write(valid_blogs, con1)
close(con1)
con1 <- file("valid/valid_news.txt", open = "wt")
valid_news <- gsub("_"," ",valid_news)
write(valid_news, con1)
close(con1)
con1 <- file("valid/valid_twitter.txt", open = "wt")
valid_twitter <- gsub("_"," ",valid_twitter)
write(valid_twitter, con1)
close(con1)
dir.create("test")
file.create("test/test_blogs.txt", "test/test_news.txt", "test/test_twitter.txt", overwrite = TRUE)
con1 <- file("test/test_blogs.txt", open = "wt")
test_blogs <- gsub("_"," ",test_blogs)
write(test_blogs, con1)
close(con1)
con1 <- file("test/test_news.txt", open = "wt")
test_news <- gsub("_"," ",test_news)
write(test_news, con1)
close(con1)
con1 <- file("test/test_twitter.txt", open = "wt")
test_twitter <- gsub("_"," ",test_twitter)
write(test_twitter, con1)
close(con1)
The corpus must be reloaded with the proper language, alphabet, and character encoding; non-ASCII characters are dropped below.
rm(nlp_cap, con1, valid_blogs, valid_news, valid_twitter, test_blogs, test_news, test_twitter, train1, train2, train3, valid1, valid2, valid3, test1, test2, test3, train_blogs, train_news, train_twitter, bases, h, i, j)
nlp_cap_train <- VCorpus(DirSource("train/"), readerControl = list(reader = readPlain, language = "en_US", load = TRUE))
for(i in 1:3){
nlp_cap_train[[i]]$content <- iconv(nlp_cap_train[[i]]$content, "latin1", "ASCII",sub='')
}
Below are summary statistics of the training sample.
rm(i)
nlp_cap_train <- VCorpus(VectorSource(nlp_cap_train))
nlp_cap_train_quant <- corpus(nlp_cap_train)
rm(nlp_cap_train)
summary(nlp_cap_train_quant)
Corpus consisting of 3 documents:
Text Types Tokens Sentences datetimestamp id language
text1 460484 38158677 1868983 2018-10-08 23:59:56 1 en
text2 109343 2754162 129974 2018-10-08 23:59:56 2 en
text3 530068 32934054 2329808 2018-10-08 23:59:56 3 en
Source: Converted from tm Corpus 'nlp_cap_train'
Created: Mon Oct 08 18:00:08 2018
Notes:
According to Wikipedia, “an n-gram is a contiguous sequence of n items from a given sample of text or speech”. In this project the items are words.
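As a minimal illustration (a toy sentence, not part of the corpus), quanteda's tokens_ngrams() builds word n-grams by joining adjacent tokens with an underscore, which is why underscores were replaced with spaces when the splits were written out above.
toy <- tokens("the quick brown fox jumps") # toy sentence, not from the corpus
tokens_ngrams(toy, n = 2) # "the_quick" "quick_brown" "brown_fox" "fox_jumps"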
Converting the corpus into tokens, removing numbers, punctuation, Twitter symbols, URLs, and other symbols.
tok <- tokens(nlp_cap_train_quant, remove_numbers = TRUE, remove_punct = TRUE, remove_twitter = TRUE, remove_url = TRUE, remove_symbols = TRUE)
rm(nlp_cap_train_quant)
Table with the 10 most frequent unigrams in the training set.
gram_1 <- tokens_ngrams(tok, n = 1)
gram_1_dfm <- dfm(gram_1)
uni_top <- topfeatures(gram_1_dfm, 10)
uni_top
the to and a i of in you is
2651100 1729622 1436613 1416193 1355643 1163466 920852 767501 731563
for
697608
rm(gram_1)
Converting the 1-gram tokens into a data table for easier manipulation, keeping only unigrams that appear more than seven times.
gram_1_dt <- tbl_df(textstat_frequency(gram_1_dfm)) %>%
rename(last_term = feature) %>%
mutate(first_term = last_term) %>%
mutate(middle_term = last_term) %>%
select(first_term, middle_term, last_term, frequency) %>%
filter(frequency > 7)
rm(gram_1_dfm)
Table with the 10 most frequent bigrams in the training set.
gram_2 <- tokens_ngrams(tok, n = 2)
gram_2_dfm <- dfm(gram_2)
bi_top <- topfeatures(gram_2_dfm, 10)
bi_top
of_the in_the for_the to_the on_the to_be at_the i_have and_the
232335 222209 123952 122597 116608 106665 80420 72095 70189
i_was
68251
rm(gram_2)
Converting the 2-gram tokens into a data table for easier manipulation, keeping only bigrams that appear more than seven times.
gram_2_dt <- tbl_df(textstat_frequency(gram_2_dfm))
gram_2_dt <- mutate(gram_2_dt, first_term = gsub("_.*$", "", gram_2_dt$feature), last_term = gsub(".*_", "", gram_2_dt$feature)) %>%
mutate(middle_term_1 = first_term) %>%
mutate(middle_term_2 = last_term) %>%
select(first_term, middle_term_1, middle_term_2, last_term, frequency) %>%
filter(frequency > 7)
rm(gram_2_dfm)
Table with the 10 most frequent trigrams in the training set.
gram_3 <- tokens_ngrams(tok, n = 3)
gram_3_dfm <- dfm(gram_3)
tri_top <- topfeatures(gram_3_dfm, 10)
tri_top
thanks_for_the one_of_the a_lot_of
21411 18944 17427
i_want_to to_be_a going_to_be
11952 11849 11469
i_have_a looking_forward_to i_have_to
9838 9509 9376
it_was_a
9279
rm(tok, gram_3)
Converting the 3-gram tokens into a data table for easier manipulation, keeping only trigrams that appear more than seven times.
gram_3_dt <- tbl_df(textstat_frequency(gram_3_dfm))
gram_3_dt <- mutate(gram_3_dt, first_term_bi = gsub("_[^_]*$","",gram_3_dt$feature))
gram_3_dt <- mutate(gram_3_dt,last_term = gsub(".*_", "", gram_3_dt$feature)) %>%
mutate(first_term = gsub("_.*$", "", gram_3_dt$first_term_bi)) %>%
mutate(middle_term = gsub(".*_", "", gram_3_dt$first_term_bi)) %>%
select(first_term, middle_term, last_term, frequency) %>%
filter(frequency > 7)
rm(gram_3_dfm)
The method of linear interpolation for a trigram model consists of the following steps:
1. Compute the maximum likelihood (ML) probabilities for unigrams according to:
\[P(w_1) = \frac{count(w_1)}{\sum{count(w_1)}}\] 2. Compute the ML probabilities for bigrams according to:
\[P(w_2 | w_1) = \frac{count(w_2, w_1)}{count(w_1)}\] 3. Compute the ML probabilities for trigrams according to:
\[P(w_3 | w_2, w_1) = \frac{count(w_3, w_2, w_1)}{count(w_2, w_1)}\] 4. The data is very sparse, so to smooth the estimates compute the interpolated probability:
\[P_{interp}(w_3 | w_2, w_1) = \lambda_1P(w_3 | w_2, w_1) + \lambda_2P(w_3 | w_2) + \lambda_3P(w_3)\] with \[\lambda_1 > 0, \lambda_2 > 0, \lambda_3 > 0\]
and \[\lambda_1 + \lambda_2 + \lambda_3 = 1\]
5. Choose the lambdas that maximize the log-likelihood of the validation set:
\[L(\lambda_1, \lambda_2, \lambda_3) = \sum{count(w_1^*,w_2^*,w_3^*)\log\left(P_{interp}(w_3 | w_2, w_1)\right)}\]
where \[count(w_1^*,w_2^*,w_3^*)\] are the trigram counts on the validation set.
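As a toy numeric illustration (the probabilities and weights below are hypothetical, not estimates from this corpus), the interpolated probability is simply a weighted average of the three ML estimates:
p_tri <- 0.50 # hypothetical P(w3 | w2, w1)
p_bi <- 0.20 # hypothetical P(w3 | w2)
p_uni <- 0.01 # hypothetical P(w3)
lambda <- c(0.6, 0.3, 0.1) # positive weights that sum to 1
p_interp <- lambda[1]*p_tri + lambda[2]*p_bi + lambda[3]*p_uni
p_interp # 0.361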
Calculating counts and probabilities for unigrams, bigrams, and trigrams.
# Trigram: ML probability of the last term given the first two terms
gram_3_dt_prob <- group_by(gram_3_dt, .dots = c("first_term", "middle_term")) %>%
mutate(sum_freq = sum(frequency)) %>%
ungroup() %>%
mutate(final_prob_tri = frequency/sum_freq) %>%
arrange(desc(frequency)) %>%
select(first_term, middle_term, last_term, frequency, sum_freq, final_prob_tri)
# Bigram: ML probability of the last term given the preceding term
gram_2_dt_prob <- group_by(gram_2_dt, first_term) %>%
mutate(sum_freq = sum(frequency)) %>%
ungroup() %>%
mutate(final_prob_bi = frequency/sum_freq) %>%
arrange(desc(frequency)) %>%
rename(middle_term = middle_term_1) %>%
select(middle_term, last_term, frequency, sum_freq, final_prob_bi)
# Unigram: ML probability of each term
gram_1_dt_prob <- mutate(gram_1_dt, sum_freq = sum(frequency)) %>%
mutate(final_prob_uni = frequency/sum_freq) %>%
arrange(desc(frequency)) %>%
select(last_term, frequency, sum_freq, final_prob_uni)
# Joining the unigram, bigram, and trigram probability tables
rm(gram_1_dt, gram_2_dt, gram_3_dt)
data_training <- left_join(gram_1_dt_prob, gram_2_dt_prob, by = "last_term" ) %>%
mutate(final_prob_bi = if_else(is.na(final_prob_bi ), 0, final_prob_bi ))
data_training <- left_join(data_training, gram_3_dt_prob, by = c("middle_term","last_term")) %>%
mutate(final_prob_tri = if_else(is.na(final_prob_tri ), 0, final_prob_tri)) %>%
select(first_term, middle_term, last_term, final_prob_tri, final_prob_bi, final_prob_uni)
# Backfill the first and middle terms that are missing for n-grams with no higher-order match
data_training <- mutate(data_training, first_term = if_else(is.na(first_term), middle_term, first_term)) %>%
mutate(middle_term = if_else(is.na(middle_term), last_term, middle_term), first_term = if_else(is.na(first_term), middle_term, first_term))
# Cleaning up memory
rm(gram_1_dt_prob, gram_2_dt_prob, gram_3_dt_prob)
nlp_cap_valid <- VCorpus(DirSource("valid/"), readerControl = list(reader = readPlain, language = "en_US", load = TRUE))
for(i in 1:3){
nlp_cap_valid[[i]]$content <- iconv(nlp_cap_valid[[i]]$content, "latin1", "ASCII",sub='')
}
nlp_cap_valid_quant <- corpus(nlp_cap_valid)
rm(nlp_cap_valid)
summary(nlp_cap_valid_quant)
Corpus consisting of 3 documents:
Text Types Tokens Sentences datetimestamp
valid_blogs.txt 89378 2125212 103918 2018-10-09 00:16:37
valid_news.txt 21355 152322 7197 2018-10-09 00:16:37
valid_twitter.txt 91615 1832766 130000 2018-10-09 00:16:37
id language
valid_blogs.txt en_US
valid_news.txt en_US
valid_twitter.txt en_US
Source: Converted from tm Corpus 'nlp_cap_valid'
Created: Mon Oct 08 18:16:38 2018
Notes:
tok <- tokens(nlp_cap_valid_quant, remove_numbers = TRUE, remove_punct = TRUE, remove_twitter = TRUE, remove_url = TRUE, remove_symbols = TRUE)
rm(nlp_cap_valid_quant)
gram_3 <- tokens_ngrams(tok, n = 3)
gram_3_dfm <- dfm(gram_3)
gram_3_dt_val <- tbl_df(textstat_frequency(gram_3_dfm))
gram_3_dt_val <- mutate(gram_3_dt_val, first_term_bi = gsub("_[^_]*$","",gram_3_dt_val$feature))
gram_3_dt_val <- mutate(gram_3_dt_val,last_term = gsub(".*_", "", gram_3_dt_val$feature)) %>%
mutate(first_term = gsub("_.*$", "", gram_3_dt_val$first_term_bi)) %>%
mutate(middle_term = gsub(".*_", "", gram_3_dt_val$first_term_bi)) %>%
select(first_term, middle_term, last_term, frequency) %>%
filter(frequency > 7)
Table with the 10 most frequent trigrams in the validation set.
topfeatures(gram_3_dfm, 10)
thanks_for_the one_of_the a_lot_of
1171 1085 959
i_want_to to_be_a going_to_be
683 640 621
i_have_a looking_forward_to it_was_a
545 528 509
i_have_to
495
rm(gram_3_dfm, tok, gram_3, i)
data_inter <- inner_join(data_training, gram_3_dt_val, by = c("first_term", "middle_term", "last_term"))
# Log-likelihood of the validation-set trigram counts under the interpolated model, as a function of the lambdas
calc_interp_prob <- function(lambda) {
data <- mutate(data_inter, prob_inter = lambda[1]*final_prob_tri + lambda[2]*final_prob_bi + lambda[3]*final_prob_uni) %>%
mutate(log_prob_inter = log(prob_inter)) %>%
mutate(loss_inter = frequency*log_prob_inter) %>%
summarise(cost_inter = sum(loss_inter))
l <- as.numeric(data[[1]])
return(l)
}
params <- c(0.15, 0.1, 0.1) # starting values for lambda_1, lambda_2, lambda_3
# Constraints ui %*% lambda >= ci: each lambda >= 0, each lambda <= 1, and their sum <= 1
ui1 <- cbind(c(1,0,0,-1,0,0,-1), c(0,1,0,0,-1,0,-1), c(0,0,1,0,0,-1,-1))
ci1 <- rbind(0,0,0,-1,-1,-1,-1)
# Maximize (fnscale = -1) the validation log-likelihood subject to the constraints
opt <- constrOptim(params, calc_interp_prob, NULL, method = "Nelder-Mead", ui = ui1, ci = ci1, outer.iterations = 1000, outer.eps = 1e-12, control = list(fnscale = -1))
The values of the optimal lambdas are 0.6, 0.4, and 0.
data_training <- mutate(data_training, prob = opt$par[1]*final_prob_tri+opt$par[2]*final_prob_bi+opt$par[3]*final_prob_uni) %>%
select(first_term, middle_term, last_term, prob)
The perplexity of a language model is defined as:
\[2^{-l}\] where \[l = \frac{1}{N}\sum{\log_2\left(P_{interp}(w_3 | w_2, w_1)\right)}\] and \[N\] is the number of trigrams in the evaluation set. The interpretation is that under a uniform model over a vocabulary of size \[V\] every word has probability \[\frac{1}{V}\] and the perplexity equals \[V\]; if the model achieves, for example, a perplexity of 100, it effectively reduces the vocabulary to about 100 equally likely choices per word.
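As a minimal sketch of the computation (the probabilities below are hypothetical, not output of the model):
p <- c(0.20, 0.10, 0.05, 0.40) # probabilities a model assigns to the observed trigrams
l <- mean(log2(p)) # average log2 probability per trigram
2^(-l) # perplexity, approximately 7.07 effective choices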
nlp_cap_test <- VCorpus(DirSource("test/"), readerControl = list(reader = readPlain, language = "en_US", load = TRUE))
for(i in 1:3){
nlp_cap_test[[i]]$content <- iconv(nlp_cap_test[[i]]$content, "latin1", "ASCII",sub='')
}
nlp_cap_test_quant <- corpus(nlp_cap_test)
rm(nlp_cap_test)
summary(nlp_cap_test_quant)
Corpus consisting of 3 documents:
Text Types Tokens Sentences datetimestamp
test_blogs.txt 90482 2145590 105868 2018-10-09 00:17:29
test_news.txt 21521 151184 7107 2018-10-09 00:17:29
test_twitter.txt 91812 1831916 129618 2018-10-09 00:17:29
id language
test_blogs.txt en_US
test_news.txt en_US
test_twitter.txt en_US
Source: Converted from tm Corpus 'nlp_cap_test'
Created: Mon Oct 08 18:17:31 2018
Notes:
tok <- tokens(nlp_cap_test_quant, remove_numbers = TRUE, remove_punct = TRUE, remove_twitter = TRUE, remove_url = TRUE, remove_symbols = TRUE)
rm(nlp_cap_test_quant)
gram_3 <- tokens_ngrams(tok, n = 3)
gram_3_dfm <- dfm(gram_3)
gram_3_dt_test <- tbl_df(textstat_frequency(gram_3_dfm))
gram_3_dt_test <- mutate(gram_3_dt_test, first_term_bi = gsub("_[^_]*$","",gram_3_dt_test$feature))
gram_3_dt_test <- mutate(gram_3_dt_test,last_term = gsub(".*_", "", gram_3_dt_test$feature)) %>%
mutate(first_term = gsub("_.*$", "", gram_3_dt_test$first_term_bi)) %>%
mutate(middle_term = gsub(".*_", "", gram_3_dt_test$first_term_bi)) %>%
select(first_term, middle_term, last_term, frequency) %>%
filter(frequency > 7)
Table with the 10 most frequent trigrams in the test set.
topfeatures(gram_3_dfm, 10)
thanks_for_the one_of_the a_lot_of
1221 1071 981
to_be_a going_to_be i_want_to
732 641 640
it_was_a i_have_a the_end_of
542 539 524
looking_forward_to
524
rm(gram_3_dfm, tok, gram_3, i)
perplexity_val <- inner_join(data_training, gram_3_dt_val, by = c("first_term", "middle_term", "last_term")) %>%
mutate(log_prob = log2(prob)) %>%
summarise(sum_log_prob = sum(log_prob), N = n()) %>%
mutate(perplexity = 2^((-1/N)*sum_log_prob))
perplexity_test <- inner_join(data_training, gram_3_dt_test, by = c("first_term", "middle_term", "last_term")) %>%
mutate(log_prob = log2(prob)) %>%
summarise(sum_log_prob = sum(log_prob), N = n()) %>%
mutate(perplexity = 2^((-1/N)*sum_log_prob))
The perplexity of the model on the validation set is 17.3 and on the test set 17.4, which is lower than the perplexity Katz found with his model on 100 sentences using trigrams.
write.table(data_training, "trigraminterp/data_training.txt", sep="\t")
write.table(uni_top, "uni_top.txt", sep="\t")
write.table(bi_top, "bi_top.txt", sep="\t")
write.table(tri_top, "tri_top.txt", sep="\t")
Collins, M. (2013). Language Modeling [Course notes]. Retrieved from the Columbia University CS website: http://www.cs.columbia.edu/~mcollins/lm-spring2013.pdf