library(tidyverse)
library(tidytext)
library(textclean)
library(tm)
library(keras)
use_condaenv("r-tensorflow")
This post is a quick demo of using an LSTM with Keras in R to perform sentiment analysis on finance news headlines. The data is available on Kaggle:
data <- read_csv("data/finance_news.csv", col_names = FALSE) %>%
  rename(
    label = X1,
    text = X2
  )
head(data)
## # A tibble: 6 x 2
## label text
## <chr> <chr>
## 1 neutral According to Gran , the company has no plans to move all production …
## 2 neutral Technopolis plans to develop in stages an area of no less than 100,0…
## 3 negative The international electronic industry company Elcoteq has laid off t…
## 4 positive With the new production plant the company would increase its capacit…
## 5 positive According to the company 's updated strategy for the years 2009-2012…
## 6 positive FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is aggressively pursuing it…
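The sentiment classes in this dataset aren’t necessarily balanced, so it’s worth glancing at the label distribution before modelling; this quick count is an extra sanity check, not part of the original post:
# class distribution of the sentiment labels
data %>% count(label)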
Let’s start off by cleaning our data. I’m using R’s textclean package to ease the cleaning process. However, for stopword removal I’m using tm instead, since it provides a better stopwords dictionary:
# clean text
data <- data %>%
  mutate(
    text = text %>%
      str_to_lower() %>%
      str_replace_all(" 's", "'s") %>%
      replace_contraction() %>%
      replace_url() %>%
      replace_html() %>%
      str_replace_all("[\\.]+", " fullstop ") %>%
      str_remove_all(pattern = "[[:digit:]]") %>% # remove numbers
      str_remove_all(pattern = "[[:punct:]]") %>%
      str_remove_all(pattern = "\\$") %>%
      str_remove_all(pattern = "\\%") %>%
      str_remove_all("eur") %>%
      str_squish(),
    label = base::factor(label, levels = c("negative", "neutral", "positive")) %>%
      as.numeric() %>%
      {. - 1}
  ) %>%
  na.omit()
# remove stopwords
rm.stopwords <- VCorpus(VectorSource(data$text)) %>%
  tm_map(removeWords, stopwords("en")) %>%
  tm_map(stripWhitespace) %>%
  sapply(as.character) %>%
  as.data.frame(stringsAsFactors = FALSE)

data.clean <- bind_cols(rm.stopwords, data[, 1]) %>%
  `colnames<-`(c("text", "label"))
head(data)
## # A tibble: 6 x 2
## label text
## <dbl> <chr>
## 1 1 according to gran the company has no plans to move all production to ru…
## 2 1 technopolis plans to develop in stages an area of no less than square m…
## 3 0 the international electronic industry company elcoteq has laid off tens…
## 4 2 with the new production plant the company would increase its capacity t…
## 5 2 according to the companys updated strategy for the years basware target…
## 6 2 financing of aspocomps growth aspocomp is aggressively pursuing its gro…
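To see what the stopword removal adds on top of the textclean steps, we can compare the same headline in data (cleaned, stopwords kept) and data.clean (stopwords removed). This side-by-side check is an illustrative addition, not part of the original post:
# same headline before and after stopword removal
data$text[1]
data.clean$text[1]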
Out of the 9,566 unique words in the corpus, we’ll keep only the 4,800 most frequent ones as features, to help prevent the model from overfitting:
num_words <- 4800

# prepare tokenizer
tokenizer <- text_tokenizer(num_words = num_words, lower = TRUE) %>%
  fit_text_tokenizer(data.clean$text)

paste(
  "Total Unique Words:", length(tokenizer$word_counts), "|",
  "Total Features:", num_words
)
## [1] "Total Unique Words: 9566 | Total Features: 4800"
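If you’re curious which tokens make the cut, the fitted tokenizer exposes its word counts, and sorting them gives a rough view of the most frequent terms. This peek is an extra illustration, not part of the original post, and assumes word_counts converts to a named list as in recent keras versions:
# ten most frequent tokens seen by the tokenizer
tokenizer$word_counts %>%
  unlist() %>%
  sort(decreasing = TRUE) %>%
  head(10)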
In this model, I’m using 80% of the data as training data, then splitting the remaining 20% evenly into validation and testing sets.
library(rsample)
set.seed(100)

# split into train - test
split <- initial_split(data.clean, prop = 0.8, strata = "label")
data_train <- training(split)
data_test <- testing(split)

# split the remaining data 50/50 into validation - test
split_val <- initial_split(data_test, prop = 0.5, strata = "label")
data_val <- training(split_val)
data_test <- testing(split_val)

maxlen <- max(str_count(data.clean$text, "\\w+")) + 1 # text cutoff
# prepare x
data_train_x <- texts_to_sequences(tokenizer, data_train$text) %>%
  pad_sequences(maxlen = maxlen)

data_val_x <- texts_to_sequences(tokenizer, data_val$text) %>%
  pad_sequences(maxlen = maxlen)

data_test_x <- texts_to_sequences(tokenizer, data_test$text) %>%
  pad_sequences(maxlen = maxlen)
# prepare y
data_train_y <- to_categorical(data_train$label, num_classes = 3)
data_val_y <- to_categorical(data_val$label, num_classes = 3)
data_test_y <- to_categorical(data_test$label, num_classes = 3)
Next, on to the architecture. Since we’re working with text data, the model starts with layer_embedding to represent the words as dense vectors. I’m also using two dropout layers, again to guard against overfitting, and since the target is a multinomial label, the output layer uses softmax as its activation function:
# initiate keras model sequence
model <- keras_model_sequential()

# model
model %>%
  # layer input
  layer_embedding(
    name = "input",
    input_dim = num_words,
    input_length = maxlen,
    output_dim = 32
  ) %>%
  # layer dropout
  layer_dropout(
    name = "embedding_dropout",
    rate = 0.5
  ) %>%
  # layer lstm
  layer_lstm(
    name = "lstm",
    units = 64,
    dropout = 0.4,
    recurrent_dropout = 0.4,
    return_sequences = FALSE
  ) %>%
  # layer dropout
  layer_dropout(
    rate = 0.5
  ) %>%
  # layer output
  layer_dense(
    name = "output",
    units = 3,
    activation = "softmax"
  )

# compile the model
model %>% compile(
  optimizer = "adam",
  metrics = "accuracy",
  loss = "categorical_crossentropy"
)
# model summary
summary(model)
## Model: "sequential"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## input (Embedding) (None, 40, 32) 153600
## ________________________________________________________________________________
## embedding_dropout (Dropout) (None, 40, 32) 0
## ________________________________________________________________________________
## lstm (LSTM) (None, 64) 24832
## ________________________________________________________________________________
## dropout (Dropout) (None, 64) 0
## ________________________________________________________________________________
## output (Dense) (None, 3) 195
## ================================================================================
## Total params: 178,627
## Trainable params: 178,627
## Non-trainable params: 0
## ________________________________________________________________________________
Lastly, I fit the model for 20 epochs with a batch size of 32, using the validation set to monitor performance.
# model fit settings
epochs <- 20
batch_size <- 32

# fit the model
history <- model %>% fit(
  data_train_x, data_train_y,
  batch_size = batch_size,
  epochs = epochs,
  verbose = 1,
  validation_data = list(
    data_val_x, data_val_y
  )
)
# history plot
plot(history)
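The curves in the plot come straight from the history object, so the exact numbers can also be pulled out directly. The snippet below is an extra illustration, not part of the original post; note that the metric may be stored as acc or accuracy depending on the Keras/TensorFlow version:
# metrics recorded at the final epoch
tail(as.data.frame(history$metrics), 1)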
With this model, we get around 93% accuracy on the training data and 92% accuracy on the validation data. Finally, I’ll use the test data below to evaluate the performance on unseen data.
# predict on test
data_test_pred <- model %>%
  predict_classes(data_test_x) %>%
  as.vector()

library(yardstick)
# performance on test
accuracy_vec(
  truth = factor(data_test$label, labels = c("negative", "neutral", "positive")),
  estimate = factor(data_test_pred, labels = c("negative", "neutral", "positive"))
)
## [1] 0.9215524
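Accuracy alone can hide class-level weaknesses, so a confusion matrix is a useful extra check. The snippet below is an addition to the original post; it reuses the same factor recoding together with yardstick’s conf_mat:
# per-class breakdown of the test predictions
pred_df <- tibble(
  truth    = factor(data_test$label, labels = c("negative", "neutral", "positive")),
  estimate = factor(data_test_pred, labels = c("negative", "neutral", "positive"))
)
conf_mat(pred_df, truth = truth, estimate = estimate)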
The model’s accuracy looks consistent, reaching about 92% on the test data as well. I’ll update this post in the future to compare the performance with other models :)