# # Data Wrangling
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.0.6 ✓ dplyr 1.0.4
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
# Text analysis
library(textclean)
library(tidytext)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
# Modeling
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(keras)
library(yardstick)
## For binary classification, the first factor level is assumed to be the event.
## Use the argument `event_level = "second"` to alter this as needed.
##
## Attaching package: 'yardstick'
## The following object is masked from 'package:keras':
##
## get_weights
## The following objects are masked from 'package:caret':
##
## precision, recall, sensitivity, specificity
## The following object is masked from 'package:readr':
##
## spec
# set conda env
use_condaenv("r-tensorflow")
data <- read_csv("data/finance_news.csv", col_names = F) %>%
rename(
label = X1,
text = X2
)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## X1 = col_character(),
## X2 = col_character()
## )
head(data)
## # A tibble: 6 x 2
## label text
## <chr> <chr>
## 1 neutral According to Gran , the company has no plans to move all production …
## 2 neutral Technopolis plans to develop in stages an area of no less than 100,0…
## 3 negative The international electronic industry company Elcoteq has laid off t…
## 4 positive With the new production plant the company would increase its capacit…
## 5 positive According to the company 's updated strategy for the years 2009-2012…
## 6 positive FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is aggressively pursuing it…
data <- data %>%
mutate(
text = text %>%
replace_url() %>%
replace_html() %>%
str_remove_all("@([0-9a-zA-Z_]+)") %>%
str_remove_all("#([0-9a-zA-Z_]+)") %>%
str_replace_all("[\\?]+", " questionmark ") %>%
str_replace_all("[\\!]+", " exclamationmark ") %>%
str_remove_all('[\\&]+') %>%
str_remove_all('[\\"]+') %>%
replace_contraction() %>%
replace_word_elongation() %>%
replace_internet_slang() %>%
str_remove_all(pattern = "[[:digit:]]") %>% # remove number
str_remove_all(pattern = "[[:punct:]]") %>%
str_remove_all(pattern = "\\$") %>% # remove dollar sign
str_to_lower() %>%
str_squish(),
label = base::factor(label, levels = c("negative", "neutral", "positive")) %>%
as.numeric() %>% {. - 1}
) %>%
select(text, label) %>%
na.omit() # remove NA
rm.stopwords <- VCorpus(VectorSource(data$text)) %>%
tm_map(removeWords,stopwords("en")) %>%
tm_map(stripWhitespace) %>%
sapply(as.character) %>%
as.data.frame(stringsAsFactors = FALSE)
data.clean <- bind_cols(rm.stopwords, data[,2]) %>%
`colnames<-`(c("text","label"))
head(data.clean)
## text
## 1 according gran company plans move production russia although company growing
## 2 technopolis plans develop stages area less square meters order host companies working computer technologies telecommunications statement said
## 3 international electronic industry company elcoteq laid tens employees tallinn facility contrary earlier layoffs company contracted ranks office workers daily postimees reported
## 4 new production plant company increase capacity meet expected increase demand improve use raw materials therefore increase production profitability
## 5 according company s updated strategy years basware targets longterm net sales growth range operating profit margin net sales
## 6 financing aspocomp s growth aspocomp aggressively pursuing growth strategy increasingly focusing technologically demanding hdi printed circuit boards pcbs
## label
## 1 1
## 2 1
## 3 0
## 4 2
## 5 2
## 6 2
text_tokenizer() to transform each cleaned word as separate tokensnum_words <- 1024
# prepare tokenizers
tokenizer <- text_tokenizer(num_words = num_words, lower = TRUE) %>%
fit_text_tokenizer(data.clean$text)
paste(
"Total Unique Words:", length(tokenizer$word_counts),"|",
"Total Features:", num_words
)
## [1] "Total Unique Words: 9341 | Total Features: 1024"
library(rsample)
set.seed(100)
# split into train - test
split <- initial_split(data.clean, strata = "label")
data_train <- training(split)
data_test <- testing(split)
# split data test to test - validation
split_val <- initial_split(data.clean, prop = 0.5, strata = "label")
data_val <- training(split_val)
data_test <- training(split_val)
maxlen <- max(str_count(data.clean$text, "\\w+")) + 1 # Text cutoff
# prepare x
data_train_x <- texts_to_sequences(tokenizer, data_train$text) %>%
pad_sequences(maxlen = maxlen)
data_val_x <- texts_to_sequences(tokenizer, data_val$text) %>%
pad_sequences(maxlen = maxlen)
data_test_x <- texts_to_sequences(tokenizer, data_test$text) %>%
pad_sequences(maxlen = maxlen)
# prepare y
data_train_y <- to_categorical(data_train$label, num_classes = 3)
data_val_y <- to_categorical(data_train$label, num_classes = 3)
data_test_y <- to_categorical(data_test$label, num_classes = 3)
# initiate keras model sequence
model <- keras_model_sequential()
# model
model %>%
# layer input
layer_embedding(
name = "input",
input_dim = num_words,
input_length = maxlen,
output_dim = 32
) %>%
# layer dropout
layer_dropout(
name = "embedding_dropout",
rate = 0.5
) %>%
# layer lstm 1
layer_lstm(
name = "lstm",
units = maxlen,
dropout = 0.25,
recurrent_dropout = 0.25,
return_sequences = FALSE,
) %>%
# layer output
layer_dense(
name = "output",
units = 3,
activation = "softmax"
)
# compile the model
model %>% compile(
optimizer = "adam",
metrics = "accuracy",
loss = "categorical_crossentropy"
)
# model summary
summary(model)
## Model: "sequential"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## input (Embedding) (None, 39, 32) 32768
## ________________________________________________________________________________
## embedding_dropout (Dropout) (None, 39, 32) 0
## ________________________________________________________________________________
## lstm (LSTM) (None, 39) 11232
## ________________________________________________________________________________
## output (Dense) (None, 3) 120
## ================================================================================
## Total params: 44,120
## Trainable params: 44,120
## Non-trainable params: 0
## ________________________________________________________________________________
# model fit settings
epochs <- 10
batch_size <- 128
# fit the model
history <- model %>% fit(
data_train_x, data_train_y,
batch_size = batch_size,
epochs = epochs,
verbose = 1,
validation_data = list(
data_test_x, data_test_y
)
)
# history plot
# plot(history)
plot
# predict on test
data_test_pred <- model %>%
predict_classes(data_test_x) %>%
as.vector()
# performance on "unseen data"
accuracy_vec(
truth = factor(data_test$label,labels = c("negative", "neutral", "positive")),
estimate = factor(data_test_pred, labels = c("negative", "neutral", "positive"))
)
## [1] 0.7838843
finance_news <- read_csv("data/finance_news.csv", col_names = F) %>%
rename(
label = X1,
text = X2
)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## X1 = col_character(),
## X2 = col_character()
## )
head(finance_news)
## # A tibble: 6 x 2
## label text
## <chr> <chr>
## 1 neutral According to Gran , the company has no plans to move all production …
## 2 neutral Technopolis plans to develop in stages an area of no less than 100,0…
## 3 negative The international electronic industry company Elcoteq has laid off t…
## 4 positive With the new production plant the company would increase its capacit…
## 5 positive According to the company 's updated strategy for the years 2009-2012…
## 6 positive FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is aggressively pursuing it…
finance_news_clean <- finance_news %>%
mutate(
text_clean = text %>%
replace_url() %>%
replace_html() %>%
str_remove_all("@([0-9a-zA-Z_]+)") %>% # remove username
str_remove_all("#([0-9a-zA-Z_]+)") %>% # remove hashtag
replace_contraction() %>%
replace_word_elongation() %>%
replace_internet_slang() %>%
str_remove_all(pattern = "[[:digit:]]") %>% # remove number
str_remove_all(pattern = "%") %>%
str_remove_all(pattern = "\\$") %>% # remove dollar sign
str_to_lower() %>% # transform to lower case
str_squish() # remove extra whitespace
) %>%
select(text, text_clean, label)
head(finance_news_clean)
## # A tibble: 6 x 3
## text text_clean label
## <chr> <chr> <chr>
## 1 According to Gran , the company h… according to gran , the company has… neutr…
## 2 Technopolis plans to develop in s… technopolis plans to develop in sta… neutr…
## 3 The international electronic indu… the international electronic indust… negat…
## 4 With the new production plant the… with the new production plant the c… posit…
## 5 According to the company 's updat… according to the company 's updated… posit…
## 6 FINANCING OF ASPOCOMP 'S GROWTH A… financing of aspocomp 's growth asp… posit…
finance_news_clean <- finance_news_clean %>%
select(-text) %>%
unnest_tokens(word, text_clean)
head(finance_news_clean)
## # A tibble: 6 x 2
## label word
## <chr> <chr>
## 1 neutral according
## 2 neutral to
## 3 neutral gran
## 4 neutral the
## 5 neutral company
## 6 neutral has
finance_news_clean <- finance_news_clean %>%
anti_join(stop_words)
## Joining, by = "word"
finance_news_clean %>%
count(word, label, sort = T) %>%
group_by(label) %>%
top_n(20) %>%
ggplot(aes(label = word, size = n, color = n)) +
ggwordcloud::geom_text_wordcloud() +
facet_wrap(~label) +
scale_size_area(max_size = 15)
## Selecting by n