library(tidyverse)
library(tidytext)
library(textclean)
library(tm)
library(keras)
use_condaenv("r-tensorflow")
This post is a quick demo of using an LSTM with Keras in R to perform sentiment analysis on finance news headlines. The data is available on Kaggle:
data <- read_csv("data/finance_news.csv", col_names = FALSE) %>%
  rename(
    label = X1,
    text = X2
  )
head(data)
## # A tibble: 6 x 2
## label text
## <chr> <chr>
## 1 neutral According to Gran , the company has no plans to move all production …
## 2 neutral Technopolis plans to develop in stages an area of no less than 100,0…
## 3 negative The international electronic industry company Elcoteq has laid off t…
## 4 positive With the new production plant the company would increase its capacit…
## 5 positive According to the company 's updated strategy for the years 2009-2012…
## 6 positive FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is aggressively pursuing it…
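The sentiment classes in this dataset aren’t necessarily balanced, so it’s worth glancing at the label distribution before modelling; this quick count is an extra sanity check, not part of the original post:
# class distribution of the sentiment labels
data %>% count(label)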
Let’s start off by cleaning our data. I’m using R’s textclean package to ease the cleaning process. However, for stopword removal I’m using tm instead, since it provides a better stopwords dictionary:
# clean text
data <- data %>%
  mutate(
    text = text %>%
      str_to_lower() %>%
      str_replace_all(" 's", "'s") %>%
      replace_contraction() %>%
      replace_url() %>%
      replace_html() %>%
      str_replace_all("[\\.]+", " fullstop ") %>%
      str_remove_all(pattern = "[[:digit:]]") %>% # remove numbers
      str_remove_all(pattern = "[[:punct:]]") %>%
      str_remove_all(pattern = "\\$") %>%
      str_remove_all(pattern = "\\%") %>%
      str_remove_all("eur") %>%
      str_squish(),
    label = base::factor(label, levels = c("negative", "neutral", "positive")) %>%
      as.numeric() %>%
      {. - 1}
  ) %>%
  na.omit()
# remove stopwords
rm.stopwords <- VCorpus(VectorSource(data$text)) %>%
  tm_map(removeWords, stopwords("en")) %>%
  tm_map(stripWhitespace) %>%
  sapply(as.character) %>%
  as.data.frame(stringsAsFactors = FALSE)

data.clean <- bind_cols(rm.stopwords, data[, 1]) %>%
  `colnames<-`(c("text", "label"))
head(data)
## # A tibble: 6 x 2
## label text
## <dbl> <chr>
## 1 1 according to gran the company has no plans to move all production to ru…
## 2 1 technopolis plans to develop in stages an area of no less than square m…
## 3 0 the international electronic industry company elcoteq has laid off tens…
## 4 2 with the new production plant the company would increase its capacity t…
## 5 2 according to the companys updated strategy for the years basware target…
## 6 2 financing of aspocomps growth aspocomp is aggressively pursuing its gro…
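To see what the stopword removal adds on top of the textclean steps, we can compare the same headline in data (cleaned, stopwords kept) and data.clean (stopwords removed). This side-by-side check is an illustrative addition, not part of the original post:
# same headline before and after stopword removal
data$text[1]
data.clean$text[1]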
Out of the 9,566 unique words in the corpus, we’ll keep only the 4,800 most frequent ones as features, to help prevent the model from overfitting:
num_words <- 4800

# prepare tokenizer
tokenizer <- text_tokenizer(num_words = num_words, lower = TRUE) %>%
  fit_text_tokenizer(data.clean$text)

paste(
  "Total Unique Words:", length(tokenizer$word_counts), "|",
  "Total Features:", num_words
)
## [1] "Total Unique Words: 9566 | Total Features: 4800"
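If you’re curious which tokens make the cut, the fitted tokenizer exposes its word counts, and sorting them gives a rough view of the most frequent terms. This peek is an extra illustration, not part of the original post, and assumes word_counts converts to a named list as in recent keras versions:
# ten most frequent tokens seen by the tokenizer
tokenizer$word_counts %>%
  unlist() %>%
  sort(decreasing = TRUE) %>%
  head(10)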
In this model, I’m using 80% of the data as training data, then splitting the remaining 20% evenly into validation and testing sets.
library(rsample)
set.seed(100)

# split into train - test
split <- initial_split(data.clean, prop = 0.8, strata = "label")
data_train <- training(split)
data_test <- testing(split)

# split the remaining data 50/50 into validation - test
split_val <- initial_split(data_test, prop = 0.5, strata = "label")
data_val <- training(split_val)
data_test <- testing(split_val)

maxlen <- max(str_count(data.clean$text, "\\w+")) + 1 # text cutoff
# prepare x
data_train_x <- texts_to_sequences(tokenizer, data_train$text) %>%
  pad_sequences(maxlen = maxlen)

data_val_x <- texts_to_sequences(tokenizer, data_val$text) %>%
  pad_sequences(maxlen = maxlen)

data_test_x <- texts_to_sequences(tokenizer, data_test$text) %>%
  pad_sequences(maxlen = maxlen)
# prepare y
data_train_y <- to_categorical(data_train$label, num_classes = 3)
data_val_y <- to_categorical(data_val$label, num_classes = 3)
data_test_y <- to_categorical(data_test$label, num_classes = 3)
Next, on to the architecture. Since we’re working with text data, the model starts with layer_embedding to represent the words as dense vectors. I’m also using two dropout layers, again to guard against overfitting, and since the target is a multinomial label, the output layer uses softmax as its activation function:
# initiate keras model sequence
model <- keras_model_sequential()

# model
model %>%
  # layer input
  layer_embedding(
    name = "input",
    input_dim = num_words,
    input_length = maxlen,
    output_dim = 32
  ) %>%
  # layer dropout
  layer_dropout(
    name = "embedding_dropout",
    rate = 0.5
  ) %>%
  # layer lstm
  layer_lstm(
    name = "lstm",
    units = 64,
    dropout = 0.4,
    recurrent_dropout = 0.4,
    return_sequences = FALSE
  ) %>%
  # layer dropout
  layer_dropout(
    rate = 0.5
  ) %>%
  # layer output
  layer_dense(
    name = "output",
    units = 3,
    activation = "softmax"
  )

# compile the model
model %>% compile(
  optimizer = "adam",
  metrics = "accuracy",
  loss = "categorical_crossentropy"
)
# model summary
summary(model)
## Model: "sequential"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## input (Embedding) (None, 40, 32) 153600
## ________________________________________________________________________________
## embedding_dropout (Dropout) (None, 40, 32) 0
## ________________________________________________________________________________
## lstm (LSTM) (None, 64) 24832
## ________________________________________________________________________________
## dropout (Dropout) (None, 64) 0
## ________________________________________________________________________________
## output (Dense) (None, 3) 195
## ================================================================================
## Total params: 178,627
## Trainable params: 178,627
## Non-trainable params: 0
## ________________________________________________________________________________
Lastly, I fit the model for 20 epochs with a batch size of 32, using the validation set to monitor performance.
# model fit settings
epochs <- 20
batch_size <- 32

# fit the model
history <- model %>% fit(
  data_train_x, data_train_y,
  batch_size = batch_size,
  epochs = epochs,
  verbose = 1,
  validation_data = list(
    data_val_x, data_val_y
  )
)
# history plot
plot(history)
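The curves in the plot come straight from the history object, so the exact numbers can also be pulled out directly. The snippet below is an extra illustration, not part of the original post; note that the metric may be stored as acc or accuracy depending on the Keras/TensorFlow version:
# metrics recorded at the final epoch
tail(as.data.frame(history$metrics), 1)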
With this model, we get around 93% accuracy on the training data and 92% accuracy on the validation data. Finally, I’ll use the test data below to evaluate the performance on unseen data.
# predict on test
data_test_pred <- model %>%
  predict_classes(data_test_x) %>%
  as.vector()

library(yardstick)
# performance on test
accuracy_vec(
  truth = factor(data_test$label, labels = c("negative", "neutral", "positive")),
  estimate = factor(data_test_pred, labels = c("negative", "neutral", "positive"))
)
## [1] 0.9215524
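Accuracy alone can hide class-level weaknesses, so a confusion matrix is a useful extra check. The snippet below is an addition to the original post; it reuses the same factor recoding together with yardstick’s conf_mat:
# per-class breakdown of the test predictions
pred_df <- tibble(
  truth    = factor(data_test$label, labels = c("negative", "neutral", "positive")),
  estimate = factor(data_test_pred, labels = c("negative", "neutral", "positive"))
)
conf_mat(pred_df, truth = truth, estimate = estimate)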
The model’s accuracy looks consistent, reaching about 92% on the test data as well. I’ll update this post in the future to compare the performance with other models :)