library(stringr)
library(tensorflow)
library(tfdatasets)
library(keras)
library(glue)
library(ff)
# Call this if you need tensorflow installed
# install_tensorflow()
I’d like to try some text classification using the tensorflow package in R. I’m leaning on this tutorial page to get some of the basics, as I’ve only used TensorFlow in Python before, not in R.
Loading in our ham and spam datasets from the SpamAssassin public corpus.
# Loading ham dataset
ham_url <- "https://spamassassin.apache.org/old/publiccorpus/20021010_easy_ham.tar.bz2"
ham_dataset <- get_file(
  "data/20021010_easy_ham",
  ham_url,
  untar = TRUE,
  cache_dir = '.',
  cache_subdir = ''
)
## Loaded Tensorflow version 2.9.2
# Loading spam dataset
spam_url <- "https://spamassassin.apache.org/old/publiccorpus/20030228_spam_2.tar.bz2"
spam_dataset <- get_file(
  "data/20030228_spam_2",
  spam_url,
  untar = TRUE,
  cache_dir = '.',
  cache_subdir = ''
)
ham_dir <- file.path("easy_ham")
spam_dir <- file.path("spam_2")
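A quick sanity check I added (not part of the original tutorial) that the two archives unpacked where we expect before we start moving things around:
# How many raw emails did we get in each unpacked folder?
length(list.files(ham_dir))
length(list.files(spam_dir))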
# Moving our dataset files to a "centralized" data folder
file.move("easy_ham/", "data/easy_ham/")
## [1] FALSE
file.move("spam_2/", "data/spam_2/")
## [1] FALSE
# Renaming unzipped files in our spam and ham dirs so they have a .txt extension
dirs <- c("spam_2", "easy_ham")
for (d in dirs) {
  dir_path <- glue("data/{d}/")
  for (f in list.files(dir_path)) {
    # Only rename files that don't already end in .txt
    if (!grepl("\\.txt$", f)) {
      new_name <- glue("{dir_path}{f}.txt")
      file.rename(glue("{dir_path}{f}"), new_name)
    }
  }
}
Generating training and validation datasets from the directories containing the raw text files. keras treats each sub-folder within our data folder as a separate class.
seed <- 1234
training_data <- text_dataset_from_directory(
  'data/',
  batch_size = 64,
  validation_split = 0.1,
  subset = 'training',
  seed = seed
)
# Validation data - 10% of the dataset in total (~390 files)
validation_data <- text_dataset_from_directory(
  'data/',
  batch_size = 64,
  validation_split = 0.1,
  subset = 'validation',
  seed = seed
)
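A quick optional check I like to do here, assuming data/ only contains the easy_ham and spam_2 sub-folders at this point: the class names come straight from the sub-folder names, and we can peek at one raw batch before any preprocessing.
# Class names are taken from the sub-folder names (alphabetical order),
# so label 0 should be easy_ham and label 1 should be spam_2
training_data$class_names
# Pull a single batch of raw text + labels to eyeball
first_batch <- reticulate::iter_next(reticulate::as_iterator(training_data))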
re <- reticulate::import("re")
punctuation <- c("!", "\"", "#", "$", "%", "&", "'", "(", ")", "*",
                 "+", ",", "-", ".", "/", ":", ";", "<", "=", ">", "?", "@",
                 "[", "\\", "]", "^", "_", "`", "{", "|", "}", "~")
punctuation_group <- punctuation %>%
  sapply(re$escape) %>%
  paste0(collapse = "") %>%
  sprintf("[%s]", .)
# Writing our standardization function
standardization_function <- function(input_data) {
  lowercase <- tf$strings$lower(input_data)
  # Strip anything that looks like an HTML tag, then remove punctuation
  stripped_html <- tf$strings$regex_replace(lowercase, '<[^>]*>', ' ')
  tf$strings$regex_replace(
    stripped_html,
    punctuation_group,
    ""
  )
}
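A quick spot-check of the standardization on a made-up string (not from the corpus):
# Should come back lower-cased, with the HTML-ish tags and punctuation stripped
standardization_function(tf$constant("Subject: WIN <b>FREE</b> Ca$h Now!!!"))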
max_features <- 10000
sequence_length <- 250
# Cleaning and tokenizing data
vectorize_layer <- layer_text_vectorization(
  standardize = standardization_function,
  max_tokens = max_features,
  output_mode = "int",
  output_sequence_length = sequence_length
)
# Make a text-only dataset (without labels), then call adapt
train_text <- training_data %>%
  dataset_map(function(text, label) text)
vectorize_layer %>% adapt(train_text)
vectorize_text <- function(text, label) {
  text <- tf$expand_dims(text, -1L)
  list(vectorize_layer(text), label)
}
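Two optional spot-checks I added here: the vocabulary the layer learned during adapt(), and what vectorize_text() does to a made-up string (the string itself is not from the corpus):
# The most frequent tokens learned during adapt()
head(get_vocabulary(vectorize_layer), 20)
# Vectorizing a made-up example: should return a length-250 integer tensor
# plus the (dummy) label we pass in
vectorize_text(tf$constant("free cash click here now"), tf$constant(1L))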
Setting up our vectorized training and validation datasets (the testing dataset comes later).
training_data_vectorized <- dataset_map(training_data, vectorize_text)
val_ds <- validation_data %>% dataset_map(vectorize_text)
model <- keras_model_sequential() %>%
  layer_embedding(max_features + 1, 16) %>%   # 16-dimensional token embeddings
  layer_dropout(0.2) %>%
  layer_global_average_pooling_1d() %>%       # average the embeddings over the sequence
  layer_dropout(0.2) %>%
  layer_dense(1)                              # single logit: spam vs ham
summary(model)
## Model: "sequential"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## embedding (Embedding) (None, None, 16) 160016
## dropout_1 (Dropout) (None, None, 16) 0
## global_average_pooling1d (GlobalAv (None, 16) 0
## eragePooling1D)
## dropout (Dropout) (None, 16) 0
## dense (Dense) (None, 1) 17
## ================================================================================
## Total params: 160,033
## Trainable params: 160,033
## Non-trainable params: 0
## ________________________________________________________________________________
We need to compile the model before training it. Note that compile() modifies the model object in place rather than returning a new one.
model %>% compile(
  # Binary cross-entropy since this is a two-class (ham vs spam) dataset;
  # from_logits = TRUE because the final dense layer has no activation
  loss = loss_binary_crossentropy(from_logits = TRUE),
  optimizer = 'adam',
  metrics = metric_binary_accuracy(threshold = 0)
) %>%
  fit(training_data_vectorized, validation_data = val_ds, epochs = 10)
Now that we have our model built and fit, let’s use it to predict spam vs. ham for some other emails. I’m using some sample messages to set up a testing dataset under data/test.
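The post doesn’t show how data/test was populated; as one hypothetical sketch (the 100-file count is arbitrary and not from the original), you could hold out a random sample from each class into matching sub-folders so the labels line up:
# Hypothetical: copy a small random sample from each class into data/test/,
# keeping the same sub-folder names so the class labels match
for (d in c("easy_ham", "spam_2")) {
  dir.create(glue("data/test/{d}"), recursive = TRUE, showWarnings = FALSE)
  holdout <- sample(list.files(glue("data/{d}"), full.names = TRUE), 100)
  file.copy(holdout, glue("data/test/{d}/"))
}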
# Setting up our testing dataset
testing_data <- text_dataset_from_directory(
  'data/test',
  batch_size = 64
)
Setting up and evaluating our testing dataset with our model
testing_dataset <- testing_data %>% dataset_map(vectorize_text)
# Evaluate the model
model %>% evaluate(testing_dataset)
## loss binary_accuracy
## 1.9279082 0.6644474
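Because the final dense layer has no activation and we trained with from_logits = TRUE, the raw predictions are logits. A small sketch I added of turning them into class-1 (spam_2) probabilities with base R’s plogis(), which is just the sigmoid:
# predict() returns one logit per test email; plogis() squashes them to [0, 1],
# so values near 1 read as "probably spam_2" and values near 0 as "easy_ham"
logits <- model %>% predict(testing_dataset)
probs <- plogis(logits)
head(probs)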
Our test accuracy of about 66% leaves a bit to be desired, so let’s try the same architecture again with a different optimizer.
model_sgd <- keras_model_sequential() %>%
  layer_embedding(max_features + 1, 16) %>%
  layer_dropout(0.2) %>%
  layer_global_average_pooling_1d() %>%
  layer_dropout(0.2) %>%
  layer_dense(1)
summary(model_sgd)
## Model: "sequential_1"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## embedding_1 (Embedding) (None, None, 16) 160016
## dropout_3 (Dropout) (None, None, 16) 0
## global_average_pooling1d_1 (Global (None, 16) 0
## AveragePooling1D)
## dropout_2 (Dropout) (None, 16) 0
## dense_1 (Dense) (None, 1) 17
## ================================================================================
## Total params: 160,033
## Trainable params: 160,033
## Non-trainable params: 0
## ________________________________________________________________________________
# Compiling our model with an SGD optimizer
model_sgd %>% compile(
  loss = loss_binary_crossentropy(from_logits = TRUE),
  optimizer = 'SGD',
  metrics = metric_binary_accuracy(threshold = 0)
) %>%
  fit(training_data_vectorized, validation_data = val_ds, epochs = 20)
model_sgd %>% evaluate(testing_dataset)
## loss binary_accuracy
## 0.6457267 0.6671105