Load the libraries that will be used.
library(tidyverse) # data wrangling
library(keras)     # deep learning
library(neuralnet)
library(tm)        # corpus handling and stopwords
library(yardstick) # model evaluation
library(textclean) # text cleaning helpers
Read in the data we will use, a financial news dataset.
news <- read_csv("data/finance_news.csv", col_names = F) %>%
  rename(
    label = X1,
    text = X2
  )
head(news)
## # A tibble: 6 × 2
## label text
## <chr> <chr>
## 1 neutral According to Gran , the company has no plans to move all production …
## 2 neutral Technopolis plans to develop in stages an area of no less than 100,0…
## 3 negative The international electronic industry company Elcoteq has laid off t…
## 4 positive With the new production plant the company would increase its capacit…
## 5 positive According to the company 's updated strategy for the years 2009-2012…
## 6 positive FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is aggressively pursuing it…
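Before preprocessing, it can be useful to check the class balance; a minimal sketch using dplyr (output not shown here):
# sketch: label distribution of the raw data
count(news, label)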
Next we perform text pre-processing: remove unneeded punctuation such as dollar signs, replace question marks with the token "questionmark" and exclamation marks with "exclamationmark", and strip ampersands, quotation marks, and so on.
news1 <- news %>%
  mutate(
    text = text %>%
      replace_url() %>%
      replace_html() %>%
      str_remove_all("@([0-9a-zA-Z_]+)") %>% # remove mentions
      str_remove_all("#([0-9a-zA-Z_]+)") %>% # remove hashtags
      str_replace_all("[\\?]+", " questionmark ") %>%
      str_replace_all("[\\!]+", " exclamationmark ") %>%
      str_remove_all('[\\&]+') %>%
      str_remove_all('[\\"]+') %>%
      replace_contraction() %>%
      replace_word_elongation() %>%
      replace_internet_slang() %>%
      str_remove_all(pattern = "[[:digit:]]") %>%
      str_remove_all(pattern = "[[:punct:]]") %>% # also removes "$" and other symbols
      str_to_lower() %>%
      str_squish(),
    # encode the labels as integers: negative = 0, neutral = 1, positive = 2
    label = factor(label, levels = c("negative", "neutral", "positive")) %>%
      as.numeric() %>% {. - 1}
  ) %>%
  select(text, label) %>%
  na.omit() # remove NA
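As an illustration (not part of the pipeline above), running a made-up sentence through a subset of the same textclean and stringr steps shows their effect:
# illustrative only: a made-up sentence through a few of the cleaning steps
example <- "The Q3 profit rose 12%!!! Isn't that gooood news? https://example.com"
example %>%
  replace_url() %>%
  str_replace_all("[\\?]+", " questionmark ") %>%
  str_replace_all("[\\!]+", " exclamationmark ") %>%
  replace_contraction() %>%
  replace_word_elongation() %>%
  str_remove_all("[[:digit:]]") %>%
  str_remove_all("[[:punct:]]") %>%
  str_to_lower() %>%
  str_squish()
# roughly: "the q profit rose exclamationmark is not that good news questionmark"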
We also need to remove stopwords.
# remove stopwords from the text
rm.stopwords <- VCorpus(VectorSource(news1$text)) %>%
  tm_map(removeWords, stopwords("en")) %>%
  tm_map(stripWhitespace) %>%
  sapply(as.character) %>%
  as.data.frame(stringsAsFactors = FALSE)
# combine the cleaned text with the original labels
news1.clean <- bind_cols(rm.stopwords, news1[, 2]) %>%
  `colnames<-`(c("text", "label"))
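For intuition, tm's removeWords() simply blanks out the listed stopwords in a string, and stripWhitespace() then collapses the leftover gaps (illustrative):
# illustrative only: stopword removal on a single sentence
removeWords("the company has no plans to move all production", stopwords("en"))
# roughly: " company   plans  move  production"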
This step splits every word across all documents into tokens. The num_words parameter sets the maximum number of words to use, sorted by descending frequency; words that occur rarely are dropped. Of all the unique words in the text data, we keep only the 1,024 most frequent ones to build the model. The lower parameter is a logical flag: if TRUE, all words are converted to lowercase (tolower).
num_words <- 1024 # number of words to keep as the vocabulary
# prepare tokenizer
tokenizer <- text_tokenizer(num_words = num_words, lower = TRUE) %>%
  fit_text_tokenizer(news1.clean$text)
paste(
  "Total Unique Words:", length(tokenizer$word_counts), "|",
  "Total Features:", num_words
)
## [1] "Total Unique Words: 9341 | Total Features: 1024"
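To see what the tokenizer learned, we can peek at its word index and encode a sentence; this is illustrative only, since the actual indices depend on the fitted corpus:
# illustrative only: the most frequent words receive the smallest indices
head(tokenizer$word_index, 3)
# encode a sentence; words outside the top-1024 vocabulary are dropped
texts_to_sequences(tokenizer, "company plans to increase production")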
Here the data is split into three parts: train, validation, and test. 50% of the data goes to training, and the remaining 50% is partitioned evenly between validation and test. Stratifying on label keeps the class proportions similar across the partitions.
library(rsample)
set.seed(123)
# 50% train, 50% held out
split <- initial_split(news1.clean, prop = 0.5, strata = "label")
data_train <- training(split)
data_test <- testing(split)
# split the held-out data into validation - test
split_val <- initial_split(data_test, prop = 0.5, strata = "label")
data_val <- training(split_val)
data_test <- testing(split_val)
count(data_test, label)
## label n
## 1 0 302
## 2 1 1437
## 3 2 681
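A quick sanity check on the resulting partition sizes (a sketch; exact row counts depend on the stratified split):
# sketch: the partition should be roughly 50/25/25
c(train = nrow(data_train), validation = nrow(data_val), test = nrow(data_test))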
maxlen <- max(str_count(news1.clean$text, "\\w+")) + 1 # length of the longest text, in words
# prepare x
data_train_x <- texts_to_sequences(tokenizer, data_train$text) %>%
  pad_sequences(maxlen = maxlen)
data_val_x <- texts_to_sequences(tokenizer, data_val$text) %>%
  pad_sequences(maxlen = maxlen)
data_test_x <- texts_to_sequences(tokenizer, data_test$text) %>%
  pad_sequences(maxlen = maxlen)
# prepare y
data_train_y <- to_categorical(data_train$label, num_classes = 3)
data_val_y <- to_categorical(data_val$label, num_classes = 3)
data_test_y <- to_categorical(data_test$label, num_classes = 3)
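Both helpers are easy to sanity-check on toy inputs (illustrative only):
# pad_sequences left-pads with zeros up to maxlen by default
pad_sequences(list(c(5, 8, 2)), maxlen = 6)
#      [,1] [,2] [,3] [,4] [,5] [,6]
# [1,]    0    0    0    5    8    2
# to_categorical one-hot encodes the integer labels
to_categorical(c(0, 1, 2), num_classes = 3)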
# initiate keras model sequence
model <- keras_model_sequential()
# model
model %>%
  # layer input
  layer_embedding(
    name = "input",
    input_dim = num_words,
    input_length = maxlen,
    output_dim = 32
  ) %>%
  # layer dropout
  layer_dropout(
    name = "embedding_dropout",
    rate = 0.3
  ) %>%
  # layer lstm 1
  layer_lstm(
    name = "lstm1",
    units = 100,
    dropout = 0.25,
    recurrent_dropout = 0.25,
    return_sequences = TRUE
  ) %>%
  # layer lstm 2
  layer_lstm(
    name = "lstm",
    units = 100,
    dropout = 0.25,
    recurrent_dropout = 0.25,
    return_sequences = FALSE
  ) %>%
  # layer output
  layer_dense(
    name = "output",
    units = 3,
    activation = "softmax"
  )
# compile the model
model %>% compile(
  optimizer = "adam",
  metrics = "accuracy",
  loss = "categorical_crossentropy"
)
# model summary
summary(model)
## Model: "sequential"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## input (Embedding) (None, 39, 32) 32768
## ________________________________________________________________________________
## embedding_dropout (Dropout) (None, 39, 32) 0
## ________________________________________________________________________________
## lstm1 (LSTM) (None, 39, 100) 53200
## ________________________________________________________________________________
## lstm (LSTM) (None, 100) 80400
## ________________________________________________________________________________
## output (Dense) (None, 3) 303
## ================================================================================
## Total params: 166,671
## Trainable params: 166,671
## Non-trainable params: 0
## ________________________________________________________________________________
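The parameter counts in the summary can be verified by hand. For an LSTM layer, params = 4 * ((input_dim + units + 1) * units), since each of the four gates has input weights, recurrent weights, and a bias. A sketch:
# sketch: reproduce the parameter counts from summary(model)
1024 * 32                   # embedding: num_words * output_dim = 32768
4 * ((32 + 100 + 1) * 100)  # lstm1: four gates over input + recurrent + bias = 53200
4 * ((100 + 100 + 1) * 100) # lstm: its input is the previous layer's 100 units = 80400
(100 + 1) * 3               # dense output: weights plus bias = 303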
# model fit settings
epochs <- 15
batch_size <- 128
# fit the model, monitoring performance on the validation set
history <- model %>% fit(
  data_train_x, data_train_y,
  batch_size = batch_size,
  epochs = epochs,
  verbose = 1,
  validation_data = list(
    data_val_x, data_val_y
  )
)
# history plot
plot(history)
# predict on test
data_test_pred <- model %>%
  predict_classes(data_test_x) %>%
  as.vector()
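Note that predict_classes() was removed in newer versions of keras. If it is unavailable, an equivalent sketch takes the argmax of the predicted class probabilities:
# sketch for newer keras versions: argmax over the softmax output
pred_prob <- predict(model, data_test_x)  # n x 3 matrix of class probabilities
data_test_pred <- max.col(pred_prob) - 1  # shift from 1-based columns to labels 0/1/2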
We can also evaluate the model's performance on the test set and obtain its accuracy.
# performance on test
performance <- accuracy_vec(
  truth = factor(data_test$label, labels = c("negative", "neutral", "positive")),
  estimate = factor(data_test_pred, labels = c("negative", "neutral", "positive"))
)
performance
## [1] 0.7942149
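Beyond overall accuracy, a confusion matrix shows where the model confuses classes; a sketch using yardstick's conf_mat() on the same factors as above:
# sketch: per-class breakdown of the test predictions
pred_df <- tibble(
  truth = factor(data_test$label, labels = c("negative", "neutral", "positive")),
  estimate = factor(data_test_pred, labels = c("negative", "neutral", "positive"))
)
conf_mat(pred_df, truth = truth, estimate = estimate)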