library(keras)
library(RVerbalExpressions)
library(magrittr)
library(textclean)
library(tidyverse)
library(tidytext)
library(rsample)
library(yardstick)
library(caret)

# Read the raw Twitter sexism dataset
twitter_raw <- read.csv("data-input/twitter_sexism_parsed_dataset.csv",
                        stringsAsFactors = FALSE,
                        encoding = "UTF-8")
head(twitter_raw)
#> index id
#> 1 5.35198627292254E+017 5.35198627292254E+017
#> 2 5.75984924030714E+017 5.75984924030714E+017
#> 3 5.7233536016588E+017 5.7233536016588E+017
#> 4 5.72337925708374E+017 5.72337925708374E+017
#> 5 4.43033024528011E+017 4.43033024528011E+017
#> 6 5.69577286308987E+017 5.69577286308987E+017
#> Text
#> 1 RT @BeepsS: @senna1 @BeepsS: I'm not sexist but fuck if you're a woman and you can't Cook get your shit together.
#> 2 There's some very hate able teams this year #MKR
#> 3 RT @The_Eccles: "Everyone underestimated us" \nWe still do, as well as underestimating the judging #MKR
#> 4 RT @NOTLukeDarcy: did @Channel7 or #MKR actually check if any of these people could cook? #WITB
#> 5 No, you don't. @Shut_Up_Jeff: I thought of a really funny joke and I promise I'm not sexist but I have to say it.
#> 6 RT @Wateronatrain: @MT8_9 You might like this http://t.co/c9m2pFmFJ3 #patriarchy
#> Annotation oh_label
#> 1 sexism 1
#> 2 none 0
#> 3 none 0
#> 4 none 0
#> 5 sexism 1
#> 6 sexism 1
glimpse(twitter_raw)
#> Rows: 14,881
#> Columns: 5
#> $ index <chr> "5.35198627292254E+017", "5.75984924030714E+017", "5.723353…
#> $ id <chr> "5.35198627292254E+017", "5.75984924030714E+017", "5.723353…
#> $ Text <chr> "RT @BeepsS: @senna1 @BeepsS: I'm not sexist but fuck if yo…
#> $ Annotation <chr> "sexism", "none", "none", "none", "sexism", "sexism", "none…
#> $ oh_label <int> 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,…
twitter_clean <- twitter_raw %>%
select(Text, oh_label) %>%
mutate(oh_label = as.factor(oh_label))
head(twitter_clean)
#> Text
#> 1 RT @BeepsS: @senna1 @BeepsS: I'm not sexist but fuck if you're a woman and you can't Cook get your shit together.
#> 2 There's some very hate able teams this year #MKR
#> 3 RT @The_Eccles: "Everyone underestimated us" \nWe still do, as well as underestimating the judging #MKR
#> 4 RT @NOTLukeDarcy: did @Channel7 or #MKR actually check if any of these people could cook? #WITB
#> 5 No, you don't. @Shut_Up_Jeff: I thought of a really funny joke and I promise I'm not sexist but I have to say it.
#> 6 RT @Wateronatrain: @MT8_9 You might like this http://t.co/c9m2pFmFJ3 #patriarchy
#> oh_label
#> 1 1
#> 2 0
#> 3 0
#> 4 0
#> 5 1
#> 6 1
# Inspect a few of the tweets labeled as sexism
text <- twitter_clean %>% filter(oh_label == "1")
head(text$Text, 5)
#> [1] "RT @BeepsS: @senna1 @BeepsS: I'm not sexist but fuck if you're a woman and you can't Cook get your shit together."
#> [2] "No, you don't. @Shut_Up_Jeff: I thought of a really funny joke and I promise I'm not sexist but I have to say it."
#> [3] "RT @Wateronatrain: @MT8_9 You might like this http://t.co/c9m2pFmFJ3 #patriarchy"
#> [4] "RT @MetalBarbieDoll: But yea, apparently #GamerGate women are sock puppets....lol. http://t.co/4WouDly45a"
#> [5] ":D @nkrause11 Dudes who go to culinary school: #why #findawife #notsexist :)"
# Calculate label frequencies
label_freq <- table(twitter_clean$oh_label)
# Convert label frequencies to a data frame for plotting
label_freq_df <- data.frame(label = names(label_freq),
                            frequency = as.numeric(label_freq))
# Create a bar plot
ggplot(data = label_freq_df, aes(x = label, y = frequency, fill = label)) +
geom_bar(stat = "identity") +
labs(title = "Label Frequency",
x = "Label",
y = "Frequency") +
theme_minimal()
Insight: the labels in this dataset are imbalanced, so undersampling will be
applied so that the model does not simply learn to favor the majority class.
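As a quick numeric check, with 11,501 "none" versus 3,377 "sexism" tweets, a majority-class baseline would already reach about 77% accuracy:

# Proportion of each class in the full dataset
prop.table(table(twitter_clean$oh_label))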
# Step 2: Check label imbalance
label_counts <- table(twitter_clean$oh_label)
print(label_counts)
#>
#> 0 1
#> 11501 3377
# Step 3: Undersampling (if needed)
# Find the minority class
minority_class <- names(which.min(label_counts))
# Find the number of samples in the minority class
minority_count <- min(label_counts)
# Filter data for each class and perform undersampling
data_undersampled <- twitter_clean %>%
drop_na(oh_label) %>%
group_by(oh_label) %>%
  sample_n(size = minority_count, replace = FALSE) %>%
ungroup()
# Print the undersampled dataset
print(data_undersampled)
#> # A tibble: 6,754 × 2
#> Text oh_label
#> <chr> <fct>
#> 1 THISE NOISES WHILE HE IS WAGGING HIS TAIL. wtf http://t.co/YEkXYKq3… 0
#> 2 It's Kat & Andre vs Katie & Nikki. In the last redemption r… 0
#> 3 @MaxBlumenthal @yaghiadam75 http://t.co/rLWZo41yxJ 0
#> 4 @PhilipMills8 @PsychBarakat @jncatron And there is no Palestine and… 0
#> 5 RT @Nayadrikj: GO AWAY PHOTOBOMB, YOU'RE CREEPY #ThanksGrandma http… 0
#> 6 @banhammor it's not true. see last 2 tweets. 0
#> 7 Can't ask campers to wait for breakfast. Gotta give them what they … 0
#> 8 @cooperq and he is defending it right now. ugh. 0
#> 9 Smears on a plate, for real ? #MKR 0
#> 10 If your company has an abuse, social media, or community team, we w… 0
#> # ℹ 6,744 more rows
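Both classes now contain the same number of tweets; a quick check:

# Expect 3,377 tweets per class after undersampling
table(data_undersampled$oh_label)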
The text is cleaned with steps commonly applied to Twitter datasets, following the Algotech reference.
dollar <- rx() %>%
rx_find("$")
number <- rx_digit()
punctuation <- rx_punctuation()
exclamation <- rx() %>%
rx_find(value = "!") %>%
rx_one_or_more()
question <- rx() %>%
rx_find(value = "?") %>%
rx_one_or_more()
hashtag <- rx() %>%
rx_find(value = "#") %>%
rx_alnum() %>%
rx_one_or_more()
mention <- rx() %>%
rx_find(value = "@") %>%
rx_alnum() %>%
  rx_one_or_more()
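Before applying the patterns, it helps to preview the regular expressions that RVerbalExpressions generated; a minimal check (the exact regex strings depend on the package version):

# Print a compiled pattern and test it against a toy tweet
mention
str_detect("hello @user42 bye", pattern = mention)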
twitter_clean <- data_undersampled %>%
  mutate(
text_clean = Text %>%
replace_url() %>%
replace_emoji() %>%
replace_emoticon() %>%
replace_html() %>%
str_remove_all(pattern = mention) %>%
str_remove_all(pattern = hashtag) %>%
replace_contraction() %>%
replace_word_elongation() %>%
str_replace_all(pattern = question, replacement = "questionmark") %>%
str_replace_all(pattern = exclamation, replacement = "exclamationmark") %>%
str_remove_all(pattern = punctuation) %>%
str_remove_all(pattern = number) %>%
str_remove_all(pattern = dollar) %>%
str_to_lower() %>%
str_squish()
  )

twitter_clean %>%
select(Text, text_clean) %>%
  sample_n(5)
#> # A tibble: 5 × 2
#> Text text_clean
#> <chr> <chr>
#> 1 @gamergatetxt @Cernowatch @PlayDangerously gaters trigger too many… gaters tr…
#> 2 @TheGags29 You make many baseless assumptions and decide they are … you make …
#> 3 RT @Catherine_Sarah: THAT IS THE DESERT!!! oh gawd if they don't g… rt that i…
#> 4 Listen, you're sexist. @TommyJohn15 Listen, I'm not sexist, but ev… listen yo…
#> 5 RT @victorymonk: @MGTOWKnight @highwiregirl oh, snap!! Here is wha… rt oh sna…
data <- twitter_clean %>%
select(text_clean, oh_label) %>%
na.omit()
head(data, 10)
#> # A tibble: 10 × 2
#> text_clean oh_label
#> <chr> <fct>
#> 1 "thise noises while he is wagging his tail wtf" 0
#> 2 "it is kat andre vs katie nikki in the last redemption round instan… 0
#> 3 "" 0
#> 4 "and there is no palestine and never has been a palestine" 0
#> 5 "rt go away photobomb you are creepy" 0
#> 6 "it is not true see last tweets" 0
#> 7 "can not ask campers to wait for breakfast gotta give them what the… 0
#> 8 "and he is defending it right now ugh" 0
#> 9 "smears on a plate for real questionmark" 0
#> 10 "if your company has an abuse social media or community team we wan… 0
library(tm)
# Build a term-document matrix to count the unique terms in the cleaned corpus
tdm <- TermDocumentMatrix(data$text_clean)
length(rownames(tdm))
#> [1] 8828
# Keep only the 1,024 most frequent terms (out of 8,828) as the vocabulary
num_words <- 1024
# prepare tokenizers
tokenizer <- text_tokenizer(num_words = num_words,
lower = TRUE) %>%
  fit_text_tokenizer(data$text_clean)
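To see what the tokenizer learned, we can peek at its vocabulary; the most frequent tokens receive the smallest integer indices (the exact words depend on the fitted data):

# First few entries of the fitted vocabulary
head(tokenizer$word_index, 5)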
set.seed(100)
intrain <- initial_split(data = data, prop = 0.8, strata = "oh_label")
data_train <- training(intrain)
data_test <- testing(intrain)
set.seed(100)
inval <- initial_split(data = data_test, prop = 0.5, strata = "oh_label")
data_val <- training(inval)
data_test <- testing(inval)

maxlen <- max(str_count(data$text_clean, "\\w+")) + 1
paste("maximum number of words in data:", maxlen)
#> [1] "maximum number of words in data: 33"
# prepare x
data_train_x <- texts_to_sequences(tokenizer, data_train$text_clean) %>%
pad_sequences(maxlen = maxlen)
data_val_x <- texts_to_sequences(tokenizer, data_val$text_clean) %>%
pad_sequences(maxlen = maxlen)
data_test_x <- texts_to_sequences(tokenizer, data_test$text_clean) %>%
pad_sequences(maxlen = maxlen)
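Each matrix now has one row per tweet and maxlen columns; by default, pad_sequences() left-pads shorter sequences with zeros. A quick shape check:

# One row per training tweet, maxlen (33) columns
dim(data_train_x)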
# prepare y
# convert the factor labels back to numeric 0/1 before one-hot encoding
data_train_y <- to_categorical(as.numeric(as.character(data_train$oh_label)), num_classes = 2)
data_val_y <- to_categorical(as.numeric(as.character(data_val$oh_label)), num_classes = 2)
data_test_y <- to_categorical(as.numeric(as.character(data_test$oh_label)), num_classes = 2)

# initiate keras model sequence
model <- keras_model_sequential()
# model
model %>%
# layer input
layer_embedding(
name = "input",
input_dim = num_words,
input_length = maxlen,
output_dim = 32,
embeddings_initializer = initializer_random_uniform(minval = -0.05, maxval = 0.05, seed = 2)
) %>%
# layer dropout
layer_dropout(
name = "embedding_dropout",
rate = 0.5
) %>%
# layer lstm 1
layer_lstm(
name = "lstm",
units = 256,
dropout = 0.2,
recurrent_dropout = 0.2,
return_sequences = FALSE,
recurrent_initializer = initializer_random_uniform(minval = -0.05, maxval = 0.05, seed = 2),
kernel_initializer = initializer_random_uniform(minval = -0.05, maxval = 0.05, seed = 2)
) %>%
# layer output
layer_dense(
name = "output",
units = 2,
activation = "softmax",
kernel_initializer = initializer_random_uniform(minval = -0.05, maxval = 0.05, seed = 2)
  )

# compile the model
model %>% compile(
optimizer = "adam",
metrics = "accuracy",
loss = "categorical_crossentropy"
)
# model summary
summary(model)
#> Model: "sequential"
#> ________________________________________________________________________________
#> Layer (type) Output Shape Param #
#> ================================================================================
#> input (Embedding) (None, 33, 32) 32768
#> ________________________________________________________________________________
#> embedding_dropout (Dropout) (None, 33, 32) 0
#> ________________________________________________________________________________
#> lstm (LSTM) (None, 256) 295936
#> ________________________________________________________________________________
#> output (Dense) (None, 2) 514
#> ================================================================================
#> Total params: 329,218
#> Trainable params: 329,218
#> Non-trainable params: 0
#> ________________________________________________________________________________
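The parameter counts in the summary can be verified by hand; an LSTM has four gates, each with an input weight matrix, a recurrent weight matrix, and a bias vector:

# embedding: vocabulary size x embedding dimension
1024 * 32                  # 32768
# lstm: 4 gates x (input dim + units + bias) x units
4 * (32 + 256 + 1) * 256   # 295936
# dense output: (units + bias) x classes
(256 + 1) * 2              # 514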
# model fit settings
epochs <- 10
batch_size <- 512
# fit the model
history <- model %>% fit(
data_train_x, data_train_y,
batch_size = batch_size,
epochs = epochs,
verbose = 1,
validation_data = list(
data_val_x, data_val_y
)
)
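If validation loss starts rising before the final epoch, an early-stopping callback is a common safeguard; an optional variant (not used above) using keras's callback_early_stopping():

# Stop training once val_loss has not improved for 2 consecutive epochs
history <- model %>% fit(
  data_train_x, data_train_y,
  batch_size = batch_size,
  epochs = epochs,
  validation_data = list(data_val_x, data_val_y),
  callbacks = list(callback_early_stopping(monitor = "val_loss", patience = 2))
)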
# history plot
plot(history)

# predict on train (note: predict_classes() is deprecated in newer keras releases)
data_train_pred <- model %>%
predict_classes(data_train_x) %>%
as.vector()
# predict on val
data_val_pred <- model %>%
predict_classes(data_val_x) %>%
as.vector()
# predict on test
data_test_pred <- model %>%
predict_classes(data_test_x) %>%
  as.vector()

# accuracy on data train
accuracy_vec(
  truth = factor(data_train$oh_label, labels = c("none", "sexism")),
  estimate = factor(data_train_pred, labels = c("none", "sexism"))
)
#> [1] 0.8491673
# accuracy on data test
accuracy_vec(
  truth = factor(data_test$oh_label, labels = c("none", "sexism")),
  estimate = factor(data_test_pred, labels = c("none", "sexism"))
)
#> [1] 0.7990937
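Accuracy alone can hide class-specific errors. Since caret is already loaded, a confusion matrix on the test set adds sensitivity and specificity; a minimal sketch using the same 0/1 coding, with 1 (sexism) as the positive class:

# Per-class performance on the test set (0 = none, 1 = sexism)
confusionMatrix(
  data = factor(data_test_pred, levels = c(0, 1)),
  reference = factor(data_test$oh_label, levels = c(0, 1)),
  positive = "1"
)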