# url <- "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
#
# filename <- basename(url)
#
# options(timeout = 60 * 10)
#
# download.file(url, destfile = filename)
#
# untar(filename)

STAT654 Homework 5 - Dan Hoang cu2107
Run as much of the code as you can on your computer from Chapter 11.
TRANSFORMER MODEL
Starting in 2017, a new model architecture started overtaking recurrent neural networks across most natural language processing tasks: the Transformer. Transformers were introduced in the seminal paper “Attention Is All You Need” by Vaswani et al. Neural attention has fast become one of the most influential ideas in deep learning.
Like most neural networks, transformer models are essentially large encoder/decoder blocks that process data. One of the big innovations is that transformers process the entire input at once: with text data, for example, a transformer processes all the words in a sentence at the same time. This is made possible by a mechanism called self-attention, combined with positional encoding and multi-headed attention. In theory, each attention head learns something different, giving the encoder model more representational power.
Here we will walk through the Transformer model and use a Transformer encoder for a text-classification task on the IMDB movie reviews dataset.
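As a quick illustration before we dive in (a sketch of my own with made-up shapes, not part of the chapter's code), Keras ships a built-in multi-head attention layer, which is the core building block the Transformer encoder below relies on:
# Minimal sketch: applying Keras' multi-head self-attention to a dummy batch.
library(keras)
library(tensorflow)
x <- tf$ones(shape(2, 10, 64))                                 # (batch, tokens, embed_dim)
mha <- layer_multi_head_attention(num_heads = 4, key_dim = 64)
out <- mha(x, x)                                               # self-attention: query = value = x
out$shape                                                      # same shape as the input: (2, 10, 64)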
Preparing text data
Preparing the IMDB movie reviews data
# unlink("aclImdb", recursive = TRUE)
# untar("aclImdb_v1.tar.gz")Take a look at the content of a few of these text files
# ## -------------------------------------------------------------------------
# fs::dir_tree("aclImdb", recurse = 1)
# ## ---- eval = TRUE---------------------------------------------------------
# fs::dir_delete("aclImdb/train/unsup/")
# ## -------------------------------------------------------------------------
# writeLines(readLines("aclImdb/train/pos/4077_10.txt", warn = FALSE))

Next, let's prepare a validation set by setting apart 20% of the training text files in a new directory, aclImdb/val.
## ---- eval = TRUE---------------------------------------------------------
# library(fs)
# set.seed(1337)
# base_dir <- path("aclImdb")
#
# for (category in c("neg", "pos")) {
# filepaths <- dir_ls(base_dir / "train" / category)
# num_val_samples <- round(0.2 * length(filepaths))
# val_files <- sample(filepaths, num_val_samples)
#
# dir_create(base_dir / "val" / category)
# file_move(val_files,
# base_dir / "val" / category)
# }Now we create three TF Dataset objects for training, validation, and testing:
library(keras)
library(tfdatasets)
train_ds <- text_dataset_from_directory("aclImdb/train")
val_ds <- text_dataset_from_directory("aclImdb/val")
test_ds <- text_dataset_from_directory("aclImdb/test")

Processing words as a sequence.
Preprocessing our datasets with layer_text_vectorization(), configured to return multi-hot binary word vectors (bag-of-words unigrams):
## -------------------------------------------------------------------------
text_vectorization <-
layer_text_vectorization(max_tokens = 20000,
output_mode = "multi_hot")
text_only_train_ds <- train_ds %>%
dataset_map(function(x, y) x)
adapt(text_vectorization, text_only_train_ds)
binary_1gram_train_ds <- train_ds %>%
dataset_map( ~ list(text_vectorization(.x), .y),
num_parallel_calls = 4)
binary_1gram_val_ds <- val_ds %>%
dataset_map( ~ list(text_vectorization(.x), .y),
num_parallel_calls = 4)
binary_1gram_test_ds <- test_ds %>%
dataset_map( ~ list(text_vectorization(.x), .y),
num_parallel_calls = 4)

## -------------------------------------------------------------------------
dataset_vectorize <- function(dataset) {
dataset %>%
dataset_map(~ list(text_vectorization(.x), .y),
num_parallel_calls = 4)
}

Preparing integer sequence datasets
## -------------------------------------------------------------------------
max_length <- 600
max_tokens <- 20000
text_vectorization <- layer_text_vectorization(
max_tokens = max_tokens,
output_mode = "int",
output_sequence_length = max_length
)
adapt(text_vectorization, text_only_train_ds)
int_train_ds <- train_ds %>% dataset_vectorize()
int_val_ds <- val_ds %>% dataset_vectorize()
int_test_ds <- test_ds %>% dataset_vectorize()

The Transformer architecture
Word embeddings are vector representations of words: they map human language into a structured geometric space. Whereas the vectors obtained through one-hot encoding are binary, sparse (mostly made of zeros), and very high-dimensional, word embeddings are dense, low-dimensional floating-point vectors learned from data.
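For example (a small sketch of my own with made-up token ids, separate from the chapter's pipeline), an embedding layer maps integer token ids to dense, trainable vectors:
# Minimal sketch: an embedding layer turns integer token ids into dense vectors.
library(keras)
embedding <- layer_embedding(input_dim = 20000, output_dim = 8)  # vocab of 20,000 ids -> 8-dim vectors
token_ids <- matrix(c(5L, 42L, 7L, 13L), nrow = 1)               # one "sentence" of 4 token ids
embedding(token_ids)$shape                                       # (1, 4, 8): one dense vector per token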
Understanding self-attention
It’s a simple yet powerful idea: not all input information seen by a model is equally important to the task at hand, so models should “pay more attention” to some features and “pay less attention” to others. This idea is similar to max pooling in convnets, or to TF-IDF normalization, which assigns importance scores to tokens based on how much information different tokens are likely to carry.
A Transformer is a sequence-to-sequence model: it was designed to convert one sequence into another.
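To make the self-attention idea concrete, here is a toy, hand-rolled version in plain R (my own sketch with random numbers; the real Keras layer additionally learns query/key/value projections):
# Toy scaled dot-product self-attention over 3 tokens with 4-dimensional embeddings.
set.seed(1)
x <- matrix(rnorm(3 * 4), nrow = 3)                               # rows = token embeddings
scores <- x %*% t(x) / sqrt(ncol(x))                              # pairwise similarities, scaled by sqrt(d)
weights <- t(apply(scores, 1, function(s) exp(s) / sum(exp(s))))  # row-wise softmax
output <- weights %*% x                                           # each token becomes a weighted mix of all tokens
round(rowSums(weights), 2)                                        # each row of attention weights sums to 1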
Below, the Transformer encoder is implemented as a subclassed Layer; we then use this encoder for text classification.
## -------------------------------------------------------------------------
layer_transformer_encoder <- new_layer_class(
classname = "TransformerEncoder",
initialize = function(embed_dim, dense_dim, num_heads, ...) {
super$initialize(...)
self$embed_dim <- embed_dim
self$dense_dim <- dense_dim
self$num_heads <- num_heads
self$attention <-
layer_multi_head_attention(num_heads = num_heads,
key_dim = embed_dim)
self$dense_proj <- keras_model_sequential() %>%
layer_dense(dense_dim, activation = "relu") %>%
layer_dense(embed_dim)
self$layernorm_1 <- layer_layer_normalization()
self$layernorm_2 <- layer_layer_normalization()
},
call = function(inputs, mask = NULL) {
if (!is.null(mask))
mask <- mask[, tf$newaxis, ]
inputs %>%
{ self$attention(., ., attention_mask = mask) + . } %>%
self$layernorm_1() %>%
{ self$dense_proj(.) + . } %>%
self$layernorm_2()
},
get_config = function() {
config <- super$get_config()
for(name in c("embed_dim", "num_heads", "dense_dim"))
config[[name]] <- self[[name]]
config
}
)

USING POSITIONAL ENCODING TO REINJECT ORDER INFORMATION
The idea behind positional encoding is very simple: to give the model access to word-order information, we’re going to add the word’s position in the sentence to each word embedding. Our input word embeddings will have two components: the usual word vector, which represents the word independently of any specific context, and a position vector, which represents the position of the word in the current sentence. Hopefully, the model will then figure out how to best leverage this additional information.
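As a small sketch of the idea (my own toy example, separate from the layer implemented below), the position indices 0, 1, 2, ... get their own embedding table, and the resulting position vectors are simply added to the token embeddings:
# Toy sketch: token embeddings + position embeddings, element-wise sum.
library(keras)
tok_emb <- layer_embedding(input_dim = 20000, output_dim = 8)  # one vector per word id
pos_emb <- layer_embedding(input_dim = 600, output_dim = 8)    # one vector per position (0..599)
ids <- matrix(c(12L, 7L, 301L), nrow = 1)                      # a 3-token "sentence"
positions <- matrix(0:2, nrow = 1)                             # positions 0, 1, 2
(tok_emb(ids) + pos_emb(positions))$shape                      # (1, 3, 8)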
## -------------------------------------------------------------------------
layer_positional_embedding <- new_layer_class(
classname = "PositionalEmbedding",
initialize = function(sequence_length, input_dim, output_dim, ...) {
super$initialize(...)
self$token_embeddings <-
layer_embedding(input_dim = input_dim,
output_dim = output_dim)
self$position_embeddings <-
layer_embedding(input_dim = sequence_length,
output_dim = output_dim)
self$sequence_length <- sequence_length
self$input_dim <- input_dim
self$output_dim <- output_dim
},
call = function(inputs) {
length <- tf$shape(inputs)[-1]
positions <- tf$range(start = 0L, limit = length, delta = 1L)
embedded_tokens <- self$token_embeddings(inputs)
embedded_positions <- self$position_embeddings(positions)
embedded_tokens + embedded_positions
},
compute_mask = function(inputs, mask = NULL) {
inputs != 0
},
get_config = function() {
config <- super$get_config()
for(name in c("output_dim", "sequence_length", "input_dim"))
config[[name]] <- self[[name]]
config
}
)

Combining the Transformer encoder with positional embedding
## -------------------------------------------------------------------------
vocab_size <- 20000
sequence_length <- 600
embed_dim <- 256
num_heads <- 2
dense_dim <- 32
inputs <- layer_input(shape(NULL), dtype = "int64")
outputs <- inputs %>%
layer_positional_embedding(sequence_length, vocab_size, embed_dim) %>%
layer_transformer_encoder(embed_dim, dense_dim, num_heads) %>%
layer_global_average_pooling_1d() %>%
layer_dropout(0.5) %>%
layer_dense(1, activation = "sigmoid")Warning: Negative numbers are interpreted python-style when subsetting tensorflow tensors.
See: ?`[.tensorflow.tensor` for details.
To turn off this warning, set `options(tensorflow.extract.warn_negatives_pythonic = FALSE)`
model <-
keras_model(inputs, outputs) %>%
compile(optimizer = "rmsprop",
loss = "binary_crossentropy",
metrics = "accuracy")
model

Model: "model"
________________________________________________________________________________
Layer (type) Output Shape Param #
================================================================================
input_1 (InputLayer) [(None, None)] 0
positional_embedding (PositionalEm (None, None, 256) 5273600
bedding)
transformer_encoder (TransformerEn (None, None, 256) 543776
coder)
global_average_pooling1d (GlobalAv (None, 256) 0
eragePooling1D)
dropout (Dropout) (None, 256) 0
dense (Dense) (None, 1) 257
================================================================================
Total params: 5,817,633
Trainable params: 5,817,633
Non-trainable params: 0
________________________________________________________________________________
Training and evaluating the Transformer encoder–based model
## -------------------------------------------------------------------------
callbacks <- list(
callback_model_checkpoint("full_transformer_encoder.keras",
save_best_only = TRUE)
)
model %>% fit(
int_train_ds,
validation_data = int_val_ds,
epochs = 20,
callbacks = callbacks
)
## -------------------------------------------------------------------------
model <- load_model_tf(
"full_transformer_encoder.keras",
custom_objects = list(layer_transformer_encoder,
layer_positional_embedding))
cat(sprintf(
"Test acc: %.3f\n", evaluate(model, int_test_ds)["accuracy"]))Test acc: 0.881
Conclusion.
We get to 88.1% test accuracy, an improvement that demonstrates the value of word-order information for text classification.
Bag-of-words models vs Transformer-based sequence models.
We might think that bag-of-words methods are outdated compared to large models like the Transformer, but that is not always true. In fact, among the various techniques tried on the IMDB dataset throughout this chapter, the best performing so far was the bag-of-bigrams! So depending on the task and dataset at hand, we should choose the model that works best, considering not only predictive performance but also training time and storage requirements.
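For reference, the bag-of-bigrams setup mentioned above requires only a small change to the vectorization layer (a sketch mirroring the chapter's configuration, with a variable name of my own; not re-run here):
# Sketch: setting ngrams = 2 makes layer_text_vectorization() emit multi-hot
# vectors over both single words and word pairs (bigrams).
text_vectorization_2gram <- layer_text_vectorization(
  ngrams = 2,
  max_tokens = 20000,
  output_mode = "multi_hot"
)
adapt(text_vectorization_2gram, text_only_train_ds)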
Beyond text classification: Sequence-to-sequence learning.
Machine translation—Convert a paragraph in a source language to its equivalent in a target language.
Text summarization—Convert a long document to a shorter version that retains the most important information.
Question answering—Convert an input question into its answer.
Chatbots—Convert a dialogue prompt into a reply to this prompt, or convert the history of a conversation into the next reply in the conversation.
Text generation—Convert a text prompt into a paragraph that completes the prompt.
And so forth.