Load Libraries

Load the libraries that will be used.

library(tidyverse)
library(keras)
library(neuralnet)
library(tm)
library(yardstick)
library(textclean)

Read Data

Read the data to be used: a set of financial news articles.

news <- read_csv("data/finance_news.csv", col_names = FALSE) %>% 
  rename(
    label = X1,
    text = X2
  )

head(news)
## # A tibble: 6 × 2
##   label    text                                                                 
##   <chr>    <chr>                                                                
## 1 neutral  According to Gran , the company has no plans to move all production …
## 2 neutral  Technopolis plans to develop in stages an area of no less than 100,0…
## 3 negative The international electronic industry company Elcoteq has laid off t…
## 4 positive With the new production plant the company would increase its capacit…
## 5 positive According to the company 's updated strategy for the years 2009-2012…
## 6 positive FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is aggressively pursuing it…

Text Pre-Processing

Perform text pre-processing to remove unneeded punctuation such as dollar signs, ampersands, and quotation marks, to replace question marks with the token "questionmark" (and exclamation marks with "exclamationmark"), and so on.

news1 <- news %>%
  mutate(
    # clean the raw text
    text = text %>%
      replace_url()  %>% 
      replace_html() %>% 
      str_remove_all("@([0-9a-zA-Z_]+)") %>% # remove mentions
      str_remove_all("#([0-9a-zA-Z_]+)") %>% # remove hashtags
      str_replace_all("[\\?]+", " questionmark ") %>% 
      str_replace_all("[\\!]+", " exclamationmark ") %>% 
      str_remove_all('[\\&]+') %>% 
      str_remove_all('[\\"]+') %>% 
      replace_contraction() %>%
      replace_word_elongation() %>% 
      replace_internet_slang() %>% 
      str_remove_all(pattern = "[[:digit:]]") %>%
      str_remove_all(pattern = "[[:punct:]]") %>% # this also removes "$"
      str_to_lower() %>% 
      str_squish(), 
    # encode labels as integers: negative = 0, neutral = 1, positive = 2
    label = factor(label, levels = c("negative", "neutral", "positive")) %>% 
      as.numeric() %>% {. - 1}
  ) %>% 
  select(text, label) %>% 
  na.omit() # remove rows containing NA
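
To sanity-check the pipeline, we can push a single made-up sentence through a few of the same steps (the example string below is hypothetical, not taken from the dataset):

# hypothetical sentence illustrating a few of the cleaning steps
example <- "The company's Q3 profit rose 12%! Details: https://example.com"
example %>% 
  replace_url() %>% 
  str_replace_all("[\\!]+", " exclamationmark ") %>% 
  str_remove_all(pattern = "[[:digit:]]") %>% 
  str_remove_all(pattern = "[[:punct:]]") %>% 
  str_to_lower() %>% 
  str_squish()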

We also need to remove stopwords from the text.

# remove stopwords from the data
rm.stopwords <- VCorpus(VectorSource(news1$text)) %>%
  tm_map(removeWords, stopwords("en")) %>%
  tm_map(stripWhitespace) %>% 
  sapply(as.character) %>%
  as.data.frame(stringsAsFactors = FALSE)

# combine the cleaned text with the original labels
news1.clean <- bind_cols(rm.stopwords, news1[, 2]) %>% 
  `colnames<-`(c("text", "label"))
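
For reference, stopwords("en") is tm's built-in English stopword list; we can peek at the first few entries directly:

# first entries of the English stopword list removed above
head(stopwords("en"), 10)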

Tokenizer

This step splits each word in every document into tokens. The num_words parameter caps the number of words that will be used: words are ranked by frequency, from most to least frequent, and rarely occurring words are dropped. Of all the unique words in the text data, we keep only the 1,024 most frequent ones to build the model.

The lower parameter is a logical flag; if TRUE, every word is transformed to lowercase (tolower).

num_words <- 1024 # number of most frequent words to keep as the vocabulary

# prepare tokenizers
tokenizer <- text_tokenizer(num_words = num_words, lower = TRUE) %>% 
  fit_text_tokenizer(news1.clean$text)

paste(
  "Total Unique Words:", length(tokenizer$word_counts),"|",
  "Total Features:", num_words
)
## [1] "Total Unique Words: 9341 | Total Features: 1024"

Split Data

Here the data is split into three parts: train, validation, and test. The proportion is 60% for training, and the remaining 40% is partitioned evenly into validation and test data.

library(rsample)
set.seed(123)

split <- initial_split(news1.clean, prop = 0.6, strata = "label")
data_train <- training(split)
data_test <- testing(split)

# split the remaining data evenly into validation and test sets
split_val <- initial_split(data_test, prop = 0.5, strata = "label")
data_val <- training(split_val)
data_test <- testing(split_val)
count(data_test, label)

Split X & Y

maxlen <- max(str_count(news1.clean$text, "\\w+")) + 1 # length of the longest document, in words
  
# prepare x
data_train_x <- texts_to_sequences(tokenizer, data_train$text) %>%
  pad_sequences(maxlen = maxlen)

data_val_x <- texts_to_sequences(tokenizer, data_val$text) %>%
  pad_sequences(maxlen = maxlen)

data_test_x <- texts_to_sequences(tokenizer, data_test$text) %>%
  pad_sequences(maxlen = maxlen)

# prepare y
data_train_y <- to_categorical(data_train$label, num_classes = 3)
data_val_y <- to_categorical(data_val$label, num_classes = 3)
data_test_y <- to_categorical(data_test$label, num_classes = 3)
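
pad_sequences() left-pads shorter sequences with zeros and truncates longer ones (both "pre" by default), so every document ends up with exactly maxlen entries. A toy illustration on made-up sequences:

# toy illustration of the default "pre" padding and truncating behaviour
pad_sequences(list(c(5, 9), c(3, 1, 4, 1, 5)), maxlen = 4)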

Create Model
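
The model stacks an embedding layer, a dropout layer, two LSTM layers, and a dense softmax output with one unit per sentiment class.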

# initiate keras model sequence
model <- keras_model_sequential()

# model
model %>%
  # layer input
  layer_embedding( 
    name = "input", 
    input_dim = num_words, 
    input_length = maxlen,
    output_dim = 32
  ) %>%
  # layer dropout
  layer_dropout(
    name = "embedding_dropout",
    rate = 0.3
  ) %>%
  # layer lstm 1 (returns the full sequence to feed the next LSTM)
  layer_lstm(
    name = "lstm1",
    units = 100,
    dropout = 0.25,
    recurrent_dropout = 0.25,
    return_sequences = TRUE
  ) %>%
  # layer lstm 2 (returns only the final hidden state)
  layer_lstm(
    name = "lstm",
    units = 100,
    dropout = 0.25,
    recurrent_dropout = 0.25,
    return_sequences = FALSE
  ) %>%
  # layer output
  layer_dense(
    name = "output",
    units = 3,
    activation = "softmax"
  )
# compile the model
model %>% compile(
  optimizer = "adam",
  metrics = "accuracy",
  loss = "categorical_crossentropy"
)
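
The categorical cross-entropy loss matches the one-hot targets produced by to_categorical(), and the softmax output yields a probability for each of the three classes.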

# model summary
summary(model)
## Model: "sequential"
## ________________________________________________________________________________
## Layer (type)                        Output Shape                    Param #     
## ================================================================================
## input (Embedding)                   (None, 39, 32)                  32768       
## ________________________________________________________________________________
## embedding_dropout (Dropout)         (None, 39, 32)                  0           
## ________________________________________________________________________________
## lstm1 (LSTM)                        (None, 39, 100)                 53200       
## ________________________________________________________________________________
## lstm (LSTM)                         (None, 100)                     80400       
## ________________________________________________________________________________
## output (Dense)                      (None, 3)                       303         
## ================================================================================
## Total params: 166,671
## Trainable params: 166,671
## Non-trainable params: 0
## ________________________________________________________________________________

Model Fitting
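
The model is trained for 15 epochs with a batch size of 128, monitoring loss and accuracy on the validation set along the way.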

# model fit settings
epochs <- 15
batch_size <- 128

# fit the model
history <- model %>% fit(
  data_train_x, data_train_y,
  batch_size = batch_size, 
  epochs = epochs,
  verbose = 1,
  validation_data = list(
    data_val_x, data_val_y
  )
)

# history plot
plot(history)

Model Evaluation

# predict on test
data_test_pred <- model %>%
  predict_classes(data_test_x) %>%
  as.vector()
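
Note that predict_classes() was removed from the keras R package for TensorFlow 2.6 and later; assuming a newer keras version, an equivalent is to take the row-wise argmax of the predicted class probabilities:

# alternative without predict_classes(): argmax of predicted probabilities
pred_prob <- model %>% predict(data_test_x)
data_test_pred <- apply(pred_prob, 1, which.max) - 1 # back to 0-based labels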

Evaluate Performance

We can also evaluate the model's performance and obtain its accuracy.

# performance on test
performance <- accuracy_vec(
  truth = factor(data_test$label, levels = 0:2,
                 labels = c("negative", "neutral", "positive")),
  estimate = factor(data_test_pred, levels = 0:2,
                    labels = c("negative", "neutral", "positive"))
)

performance
## [1] 0.7942149
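
Accuracy alone hides which classes are confused with each other; as a sketch, yardstick's conf_mat() gives a per-class breakdown:

# confusion matrix on the test set (sketch using yardstick)
pred_df <- tibble(
  truth    = factor(data_test$label, levels = 0:2,
                    labels = c("negative", "neutral", "positive")),
  estimate = factor(data_test_pred, levels = 0:2,
                    labels = c("negative", "neutral", "positive"))
)
conf_mat(pred_df, truth = truth, estimate = estimate)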