Case Study: Finance News dataset Sentiment Analysis

Library Needed

# # Data Wrangling
library(tidyverse) 
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.6     ✓ dplyr   1.0.4
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
# Text analysis
library(textclean)
library(tidytext)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
# Modeling
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(keras)
library(yardstick)
## For binary classification, the first factor level is assumed to be the event.
## Use the argument `event_level = "second"` to alter this as needed.
## 
## Attaching package: 'yardstick'
## The following object is masked from 'package:keras':
## 
##     get_weights
## The following objects are masked from 'package:caret':
## 
##     precision, recall, sensitivity, specificity
## The following object is masked from 'package:readr':
## 
##     spec
# set conda env
use_condaenv("r-tensorflow")
data <- read_csv("data/finance_news.csv", col_names = F) %>% 
  rename(
    label = X1,
    text = X2
  )
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   X1 = col_character(),
##   X2 = col_character()
## )
head(data)
## # A tibble: 6 x 2
##   label    text                                                                 
##   <chr>    <chr>                                                                
## 1 neutral  According to Gran , the company has no plans to move all production …
## 2 neutral  Technopolis plans to develop in stages an area of no less than 100,0…
## 3 negative The international electronic industry company Elcoteq has laid off t…
## 4 positive With the new production plant the company would increase its capacit…
## 5 positive According to the company 's updated strategy for the years 2009-2012…
## 6 positive FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is aggressively pursuing it…

Text Pre-processing

Text Cleaning

  • clean the text data by removing some unnecessary symbols, urls, etc
  • formatting text
  • change label ‘negative’ = 0, ‘neutral’ = 1, ‘positive’ = 2
data <- data %>%
  mutate(
    text = text %>%
      replace_url()  %>% 
      replace_html() %>% 
      str_remove_all("@([0-9a-zA-Z_]+)") %>% 
      str_remove_all("#([0-9a-zA-Z_]+)") %>% 
      str_replace_all("[\\?]+", " questionmark ") %>% 
      str_replace_all("[\\!]+", " exclamationmark ") %>% 
      str_remove_all('[\\&]+') %>% 
      str_remove_all('[\\"]+') %>% 
      replace_contraction() %>%
      replace_word_elongation() %>% 
      replace_internet_slang() %>% 
      str_remove_all(pattern = "[[:digit:]]") %>% # remove number
      str_remove_all(pattern = "[[:punct:]]") %>% 
      str_remove_all(pattern = "\\$") %>% # remove dollar sign
      str_to_lower() %>% 
      str_squish(), 
    label = base::factor(label, levels = c("negative", "neutral", "positive")) %>% 
                     as.numeric() %>% {. - 1}
  ) %>% 
  select(text, label) %>% 
  na.omit() # remove NA

Remove stopwords

  • Removed stopwords based on ‘English’ corpus data
rm.stopwords <- VCorpus(VectorSource(data$text)) %>%
  tm_map(removeWords,stopwords("en")) %>%
  tm_map(stripWhitespace) %>% 
  sapply(as.character) %>%
  as.data.frame(stringsAsFactors = FALSE)

data.clean <- bind_cols(rm.stopwords, data[,2]) %>%
  `colnames<-`(c("text","label"))
head(data.clean)
##                                                                                                                                                                                text
## 1                                                                                                      according gran company plans move production russia although company growing
## 2                                     technopolis plans develop stages area less square meters order host companies working computer technologies telecommunications statement said
## 3  international electronic industry company elcoteq laid tens employees tallinn facility contrary earlier layoffs company contracted ranks office workers daily postimees reported
## 4                                new production plant company increase capacity meet expected increase demand improve use raw materials therefore increase production profitability
## 5                                                      according company s updated strategy years basware targets longterm net sales growth range operating profit margin net sales
## 6                        financing aspocomp s growth aspocomp aggressively pursuing growth strategy increasingly focusing technologically demanding hdi printed circuit boards pcbs
##   label
## 1     1
## 2     1
## 3     0
## 4     2
## 5     2
## 6     2

Tokenizer

  • using Keras text_tokenizer() to transform each cleaned word as separate tokens
num_words <- 1024

# prepare tokenizers
tokenizer <- text_tokenizer(num_words = num_words, lower = TRUE) %>% 
  fit_text_tokenizer(data.clean$text)

paste(
  "Total Unique Words:", length(tokenizer$word_counts),"|",
  "Total Features:", num_words
)
## [1] "Total Unique Words: 9341 | Total Features: 1024"

Split Data

library(rsample)
set.seed(100)

# split into train - test
split <- initial_split(data.clean, strata = "label")
data_train <- training(split)
data_test <- testing(split)

# split data test to test - validation
split_val <- initial_split(data.clean, prop = 0.5, strata = "label")
data_val <- training(split_val)
data_test <- training(split_val)

Text to sequence

maxlen <- max(str_count(data.clean$text, "\\w+")) + 1 # Text cutoff
  
# prepare x
data_train_x <- texts_to_sequences(tokenizer, data_train$text) %>%
  pad_sequences(maxlen = maxlen)

data_val_x <- texts_to_sequences(tokenizer, data_val$text) %>%
  pad_sequences(maxlen = maxlen)

data_test_x <- texts_to_sequences(tokenizer, data_test$text) %>%
  pad_sequences(maxlen = maxlen)

# prepare y
data_train_y <- to_categorical(data_train$label, num_classes = 3)
data_val_y <- to_categorical(data_train$label, num_classes = 3)
data_test_y <- to_categorical(data_test$label, num_classes = 3)

Build Architecture

# initiate keras model sequence
model <- keras_model_sequential()

# model
model %>%
  # layer input
  layer_embedding(
    name = "input",
    input_dim = num_words,
    input_length = maxlen, 
    output_dim = 32
  ) %>%
  # layer dropout
  layer_dropout(
    name = "embedding_dropout",
    rate = 0.5
  ) %>%
  # layer lstm 1
  layer_lstm(
    name = "lstm",
    units = maxlen,
    dropout = 0.25,
    recurrent_dropout = 0.25,
    return_sequences = FALSE, 
  ) %>%
  # layer output
  layer_dense(
    name = "output",
    units = 3,
    activation = "softmax"
  )

Model Compiling

# compile the model
model %>% compile(
  optimizer = "adam",
  metrics = "accuracy",
  loss = "categorical_crossentropy"
)

# model summary
summary(model)
## Model: "sequential"
## ________________________________________________________________________________
## Layer (type)                        Output Shape                    Param #     
## ================================================================================
## input (Embedding)                   (None, 39, 32)                  32768       
## ________________________________________________________________________________
## embedding_dropout (Dropout)         (None, 39, 32)                  0           
## ________________________________________________________________________________
## lstm (LSTM)                         (None, 39)                      11232       
## ________________________________________________________________________________
## output (Dense)                      (None, 3)                       120         
## ================================================================================
## Total params: 44,120
## Trainable params: 44,120
## Non-trainable params: 0
## ________________________________________________________________________________

Model Training

# model fit settings
epochs <- 10
batch_size <- 128

# fit the model
history <- model %>% fit(
  data_train_x, data_train_y,
  batch_size = batch_size, 
  epochs = epochs,
  verbose = 1,
  validation_data = list(
    data_test_x, data_test_y
  )
)

# history plot
# plot(history)

plot

Model Evaluation

# predict on test
data_test_pred <- model %>%
  predict_classes(data_test_x) %>%
  as.vector()

# performance on "unseen data"
accuracy_vec(
 truth = factor(data_test$label,labels = c("negative", "neutral", "positive")),
 estimate = factor(data_test_pred, labels = c("negative", "neutral", "positive"))
)
## [1] 0.7838843

Create a Wordcloud based on clean data

Data pre-processing

finance_news <- read_csv("data/finance_news.csv", col_names = F) %>% 
  rename(
    label = X1,
    text = X2
  )
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   X1 = col_character(),
##   X2 = col_character()
## )
head(finance_news)
## # A tibble: 6 x 2
##   label    text                                                                 
##   <chr>    <chr>                                                                
## 1 neutral  According to Gran , the company has no plans to move all production …
## 2 neutral  Technopolis plans to develop in stages an area of no less than 100,0…
## 3 negative The international electronic industry company Elcoteq has laid off t…
## 4 positive With the new production plant the company would increase its capacit…
## 5 positive According to the company 's updated strategy for the years 2009-2012…
## 6 positive FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is aggressively pursuing it…
finance_news_clean <- finance_news %>%
  mutate(
    text_clean = text %>%
      replace_url()  %>% 
      replace_html() %>% 
      str_remove_all("@([0-9a-zA-Z_]+)") %>% # remove username
      str_remove_all("#([0-9a-zA-Z_]+)") %>% # remove hashtag
      replace_contraction() %>%
      replace_word_elongation() %>% 
      replace_internet_slang() %>% 
      str_remove_all(pattern = "[[:digit:]]") %>% # remove number
      str_remove_all(pattern = "%") %>% 
      str_remove_all(pattern = "\\$") %>% # remove dollar sign
      str_to_lower() %>% # transform to lower case
      str_squish()  # remove extra whitespace
  ) %>% 
  select(text, text_clean, label)

head(finance_news_clean)
## # A tibble: 6 x 3
##   text                               text_clean                           label 
##   <chr>                              <chr>                                <chr> 
## 1 According to Gran , the company h… according to gran , the company has… neutr…
## 2 Technopolis plans to develop in s… technopolis plans to develop in sta… neutr…
## 3 The international electronic indu… the international electronic indust… negat…
## 4 With the new production plant the… with the new production plant the c… posit…
## 5 According to the company 's updat… according to the company 's updated… posit…
## 6 FINANCING OF ASPOCOMP 'S GROWTH A… financing of aspocomp 's growth asp… posit…

Tokenize

finance_news_clean <- finance_news_clean %>% 
  select(-text) %>% 
  unnest_tokens(word, text_clean) 

head(finance_news_clean)
## # A tibble: 6 x 2
##   label   word     
##   <chr>   <chr>    
## 1 neutral according
## 2 neutral to       
## 3 neutral gran     
## 4 neutral the      
## 5 neutral company  
## 6 neutral has

Remove Stopwords

finance_news_clean <- finance_news_clean %>% 
  anti_join(stop_words)
## Joining, by = "word"

Text Visualization

finance_news_clean %>% 
  count(word, label, sort = T) %>% 

  group_by(label) %>%
  top_n(20) %>%
  
  ggplot(aes(label = word, size = n, color = n)) +
  ggwordcloud::geom_text_wordcloud() +
  facet_wrap(~label) +
  scale_size_area(max_size = 15) 
## Selecting by n