library(stringr)
library(tensorflow)
library(tfdatasets)
library(keras)
library(glue)
library(ff)

# Call this if you need tensorflow installed
# install_tensorflow()

I’d like to try some text classification using the tensorflow package in R. I’m leaning on this tutorial page to get some of the basics, as I’ve only used TensorFlow in Python before, not in R.

Loading in our ham and spam datasets

# Loading ham dataset
ham_url <- "https://spamassassin.apache.org/old/publiccorpus/20021010_easy_ham.tar.bz2"

ham_dataset <- get_file(
  "data/20021010_easy_ham",
  ham_url,
  untar = TRUE,
  cache_dir = '.',
  cache_subdir = ''
)
## Loaded Tensorflow version 2.9.2
# Loading spam dataset
spam_url <- "https://spamassassin.apache.org/old/publiccorpus/20030228_spam_2.tar.bz2"

spam_dataset <- get_file(
  "data/20030228_spam_2",
  spam_url,
  untar = TRUE,
  cache_dir = '.',
  cache_subdir = ''
)
ham_dir <- file.path("easy_ham")
spam_dir <- file.path("spam_2")

# Moving our dataset files to a centralized "data" folder
file.move("easy_ham/", "data/easy_ham/")
## [1] FALSE
file.move("spam_2/", "data/spam_2/")
## [1] FALSE
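# Note: both file.move() calls above returned FALSE, meaning nothing was
# actually moved (perhaps get_file() already extracted the folders elsewhere,
# or data/ did not exist yet). A more defensive sketch of the same step, using
# only base R and assuming the goal is to end up with data/easy_ham/ and
# data/spam_2/:
if (!dir.exists("data")) dir.create("data")
for (d in c("easy_ham", "spam_2")) {
  if (dir.exists(d) && !dir.exists(file.path("data", d))) {
    file.rename(d, file.path("data", d)) # returns TRUE on success
  }
}
dir.exists(c("data/easy_ham", "data/spam_2")) # both should be TRUE before moving on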
# Renaming the unzipped files in our spam and ham dirs so they have a .txt extension
dirs <- c("spam_2", "easy_ham")

for (d in dirs) {
  dir_path <- glue("data/{d}/")
  for (f in list.files(dir_path)) {
    if (!grepl("\\.txt$", f)) {
      new_name <- glue("{dir_path}{f}.txt")
      file.rename(glue("{dir_path}{f}"), new_name)
    }
  }
}
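
As a quick sanity check (purely optional, and assuming the folder layout above), we can confirm that every file now ends in .txt and see roughly how many emails each class contains:

# Every file in both class folders should now carry a .txt extension
all(grepl("\\.txt$", list.files("data/easy_ham")))
all(grepl("\\.txt$", list.files("data/spam_2")))

# Rough class sizes (ham vs. spam)
length(list.files("data/easy_ham"))
length(list.files("data/spam_2"))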

Generating training and validation datasets from the directories containing the raw text files. keras treats each sub-folder within our data folder as a separate class.

seed <- 1234


# Training data - 90% of the dataset
training_data <- text_dataset_from_directory(
  'data/',
  batch_size = 64,
  validation_split = 0.1,
  subset = 'training',
  seed = seed
)

# Validation data - 10% of the dataset (~390 files)
validation_data <- text_dataset_from_directory(
  'data/',
  batch_size = 64,
  validation_split = 0.1,
  subset = 'validation',
  seed = seed
)
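# As a hedged check on what text_dataset_from_directory() picked up (the
# $class_names field and the reticulate iterator helpers are the API as I
# understand it), list the inferred classes and peek at one raw batch:
training_data$class_names # sub-folders under data/, in alphabetical order

batch <- training_data %>%
  reticulate::as_iterator() %>%
  reticulate::iter_next()
batch[[1]][1] # first raw email in the batch
batch[[2]][1] # its 0/1 label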
re <- reticulate::import("re")

punctuation <- c("!", "\"", "#", "$", "%", "&", "'", "(", ")", "*",
"+", ",", "-", ".", "/", ":", ";", "<", "=", ">", "?", "@", "[",
"\\", "]", "^", "_", "`", "{", "|", "}", "~")
punctuation_group <- punctuation %>%
  sapply(re$escape) %>%
  paste0(collapse = "") %>%
  sprintf("[%s]", .)

# Writing our standardization function
standardization_function <- function(input_data) {
  lowercase <- tf$strings$lower(input_data)
  stripped_html <- tf$strings$regex_replace(lowercase, '<[^>]*>', ' ') # strip HTML-style tags
  tf$strings$regex_replace(
    stripped_html,
    punctuation_group,
    ""
  )
}
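# Quick check of what the standardizer does to a made-up example string
# (the text below is purely illustrative): it lowercases, strips the
# HTML-like tags, and removes punctuation.
standardization_function(
  tf$constant("Buy NOW!!! <b>Limited</b> offer, click here...")
)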

max_features <- 10000
sequence_length <- 250

# Cleaning and tokenizing data
vectorize_layer <- layer_text_vectorization(
  standardize = standardization_function,
  max_tokens = max_features,
  output_mode = "int",
  output_sequence_length = sequence_length
)
# Make a text-only dataset (without labels), then call adapt
train_text <- training_data %>%
  dataset_map(function(text, label) text)
vectorize_layer %>% adapt(train_text)
vectorize_text <- function(text, label) {
  text <- tf$expand_dims(text, -1L)
  list(vectorize_layer(text), label)
}
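
To get a feel for the output, here’s a hedged sketch that pulls one raw batch and runs it through vectorize_text(); each email becomes a length-250 vector of integer token ids that index into the adapted vocabulary.

batch <- training_data %>%
  reticulate::as_iterator() %>%
  reticulate::iter_next()
vectorized <- vectorize_text(batch[[1]], batch[[2]])
vectorized[[1]]$shape # (batch_size, sequence_length)

# The first few entries of the learned vocabulary
vocab <- get_vocabulary(vectorize_layer)
head(vocab, 10)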

Setting up the vectorized training and validation datasets, and defining our model

training_data_vectorized <- dataset_map(training_data, vectorize_text)
val_ds <- validation_data %>% dataset_map(vectorize_text)
model <- keras_model_sequential() %>%
  layer_embedding(max_features + 1, 16) %>%
  layer_dropout(0.2) %>%
  layer_global_average_pooling_1d() %>%
  layer_dropout(0.2) %>%
  layer_dense(1)

summary(model)
## Model: "sequential"
## ________________________________________________________________________________
##  Layer (type)                       Output Shape                    Param #     
## ================================================================================
##  embedding (Embedding)              (None, None, 16)                160016      
##  dropout_1 (Dropout)                (None, None, 16)                0           
##  global_average_pooling1d (GlobalAv  (None, 16)                     0           
##  eragePooling1D)                                                                
##  dropout (Dropout)                  (None, 16)                      0           
##  dense (Dense)                      (None, 1)                       17          
## ================================================================================
## Total params: 160,033
## Trainable params: 160,033
## Non-trainable params: 0
## ________________________________________________________________________________

We need to compile our model before training it. Note that compile() modifies the model object in place.

model %>% compile(
  loss = loss_binary_crossentropy(from_logits = TRUE), # binary cross-entropy, since the outcome is binary (ham or spam)
  optimizer = 'adam',
  metrics = metric_binary_accuracy(threshold = 0)) %>%
  fit(training_data_vectorized, validation_data = val_ds, epochs = 10)
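
The fit() call above discards its return value. A variant of the same call that keeps the history object (a sketch, same settings as above) makes it easy to plot loss and accuracy per epoch:

# Same training call, but keeping the history for inspection
history <- model %>%
  fit(training_data_vectorized, validation_data = val_ds, epochs = 10)
plot(history) # training vs. validation loss and binary accuracy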

Testing our Model

Now that we have our model built and fit, let’s use it to predict spam vs. ham for some other emails. We’ll use some samples from the default spam dataset to set up our testing dataset.
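
The creation of data/test isn’t shown here, but text_dataset_from_directory() expects the same one-sub-folder-per-class layout as the training data. The folder names below are an assumption mirroring the training classes; what matters is that the ham folder sorts alphabetically before the spam folder so the 0/1 labels line up with training.

# Assumed layout of the test folder (sub-folder names are a guess):
#   data/test/easy_ham/*.txt -> label 0
#   data/test/spam_2/*.txt   -> label 1
list.dirs("data/test", recursive = FALSE)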

# Setting up our testing dataset
testing_data <- text_dataset_from_directory(
  'data/test',
  batch_size = 64
)

Vectorizing our testing dataset and evaluating the model against it

testing_dataset <- testing_data %>% dataset_map(vectorize_text)

# Evaluate the model
model %>% evaluate(testing_dataset)
##            loss binary_accuracy 
##       1.9279082       0.6644474

Our accuracy of roughly 66% leaves a bit to be desired. Let’s build an identical model and train it with the SGD optimizer instead of Adam to see whether that helps.

model_sgd <- keras_model_sequential() %>%
  layer_embedding(max_features + 1, 16) %>%
  layer_dropout(0.2) %>%
  layer_global_average_pooling_1d() %>%
  layer_dropout(0.2) %>%
  layer_dense(1)

summary(model_sgd)
## Model: "sequential_1"
## ________________________________________________________________________________
##  Layer (type)                       Output Shape                    Param #     
## ================================================================================
##  embedding_1 (Embedding)            (None, None, 16)                160016      
##  dropout_3 (Dropout)                (None, None, 16)                0           
##  global_average_pooling1d_1 (Global  (None, 16)                     0           
##  AveragePooling1D)                                                              
##  dropout_2 (Dropout)                (None, 16)                      0           
##  dense_1 (Dense)                    (None, 1)                       17          
## ================================================================================
## Total params: 160,033
## Trainable params: 160,033
## Non-trainable params: 0
## ________________________________________________________________________________
# Compiling our model with an SGD optimizer
model_sgd %>% compile(
  loss = loss_binary_crossentropy(from_logits = TRUE),
  optimizer = 'SGD',
  metrics = metric_binary_accuracy(threshold = 0)) %>%
  fit(training_data_vectorized, validation_data = val_ds, epochs = 20)
model_sgd %>% evaluate(testing_dataset)
##            loss binary_accuracy 
##       0.6457267       0.6671105
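
As a final sketch (the two emails below are made up, and I’m assuming label 1 corresponds to the spam class), the trained model can score raw strings by pushing them through the same vectorization layer; since the model outputs logits, plogis() converts them to probabilities.

# Two made-up emails: one spammy, one ham-like
sample_emails <- c(
  "Congratulations!!! You have been selected for a FREE prize, click here now",
  "Hi all, attached are the meeting notes from this morning. Thanks!"
)

# Vectorize the raw strings the same way the training data was, then predict.
# plogis() maps the logits to probabilities of the spam class.
logits <- model %>% predict(vectorize_layer(tf$constant(sample_emails)))
plogis(logits)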