1 Data Preparation

library(keras)
library(RVerbalExpressions)
library(magrittr)
library(textclean)
library(tidyverse)
library(tidytext)
library(rsample)
library(yardstick)
library(caret)

# Please run the code down below
twitter_raw <- read.csv("data-input/twitter_sexism_parsed_dataset.csv",
                    stringsAsFactors = FALSE,
                    encoding = "UTF-8") 

head(twitter_raw)

#>                   index                    id
#> 1 5.35198627292254E+017 5.35198627292254E+017
#> 2 5.75984924030714E+017 5.75984924030714E+017
#> 3  5.7233536016588E+017  5.7233536016588E+017
#> 4 5.72337925708374E+017 5.72337925708374E+017
#> 5 4.43033024528011E+017 4.43033024528011E+017
#> 6 5.69577286308987E+017 5.69577286308987E+017
#>                                                                                                                Text
#> 1 RT @BeepsS: @senna1 @BeepsS: I'm not sexist but fuck if you're a woman and you can't Cook get your shit together.
#> 2                                                                  There's some very hate able teams this year #MKR
#> 3           RT @The_Eccles: "Everyone underestimated us" \nWe still do, as well as underestimating the judging #MKR
#> 4                   RT @NOTLukeDarcy: did @Channel7 or #MKR actually check if any of these people could cook? #WITB
#> 5 No, you don't. @Shut_Up_Jeff: I thought of a really funny joke and I promise I'm not sexist but I have to say it.
#> 6                                  RT @Wateronatrain: @MT8_9 You might like this http://t.co/c9m2pFmFJ3 #patriarchy
#>   Annotation oh_label
#> 1     sexism        1
#> 2       none        0
#> 3       none        0
#> 4       none        0
#> 5     sexism        1
#> 6     sexism        1

glimpse(twitter_raw)

#> Rows: 14,881
#> Columns: 5
#> $ index      <chr> "5.35198627292254E+017", "5.75984924030714E+017", "5.723353…
#> $ id         <chr> "5.35198627292254E+017", "5.75984924030714E+017", "5.723353…
#> $ Text       <chr> "RT @BeepsS: @senna1 @BeepsS: I'm not sexist but fuck if yo…
#> $ Annotation <chr> "sexism", "none", "none", "none", "sexism", "sexism", "none…
#> $ oh_label   <int> 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,…

twitter_clean <- twitter_raw %>% 
  select(Text, oh_label) %>%
  mutate(oh_label = as.factor(oh_label))

head(twitter_clean)

#>                                                                                                                Text
#> 1 RT @BeepsS: @senna1 @BeepsS: I'm not sexist but fuck if you're a woman and you can't Cook get your shit together.
#> 2                                                                  There's some very hate able teams this year #MKR
#> 3           RT @The_Eccles: "Everyone underestimated us" \nWe still do, as well as underestimating the judging #MKR
#> 4                   RT @NOTLukeDarcy: did @Channel7 or #MKR actually check if any of these people could cook? #WITB
#> 5 No, you don't. @Shut_Up_Jeff: I thought of a really funny joke and I promise I'm not sexist but I have to say it.
#> 6                                  RT @Wateronatrain: @MT8_9 You might like this http://t.co/c9m2pFmFJ3 #patriarchy
#>   oh_label
#> 1        1
#> 2        0
#> 3        0
#> 4        0
#> 5        1
#> 6        1

text <- twitter_clean %>% filter(oh_label == "1")

head(text$Text,5)

#> [1] "RT @BeepsS: @senna1 @BeepsS: I'm not sexist but fuck if you're a woman and you can't Cook get your shit together."
#> [2] "No, you don't. @Shut_Up_Jeff: I thought of a really funny joke and I promise I'm not sexist but I have to say it."
#> [3] "RT @Wateronatrain: @MT8_9 You might like this http://t.co/c9m2pFmFJ3 #patriarchy"                                 
#> [4] "RT @MetalBarbieDoll: But yea, apparently #GamerGate women are sock puppets....lol. http://t.co/4WouDly45a"        
#> [5] ":D @nkrause11 Dudes who go to culinary school: #why #findawife #notsexist :)"

1.1 Mengecek Distribusi Label

# Calculate label frequencies
label_freq <- table(twitter_clean$oh_label)

# Convert label frequencies to a data frame for plotting
label_freq_df <- data.frame(label = names(label_freq), frequency = label_freq)

# Create a bar plot
ggplot(data = label_freq_df, aes(x = label, y = frequency.Freq, fill = label)) +
  geom_bar(stat = "identity") +
  labs(title = "Label Frequency",
       x = "Label",
       y = "Frequency") +
  theme_minimal()

Insigth Label Pada dataset ini imbalance, oleh karenanya akan dilakukan undersampling agar model tidak overfiting

# Step 2: Check label imbalance
label_counts <- table(twitter_clean$oh_label) # Assuming "label_column" is the column name containing the class labels
print(label_counts)

#> 
#>     0     1 
#> 11501  3377

# Step 3: Undersampling (if needed)
# Find the minority class
minority_class <- names(which.min(label_counts))

# Find the number of samples in the minority class
minority_count <- min(label_counts)

# Filter data for each class and perform undersampling
data_undersampled <- twitter_clean %>%
  drop_na(oh_label) %>%
  group_by(oh_label) %>%
  sample_n(size = minority_count, replace = TRUE) %>%
  ungroup()

# Print the undersampled dataset
print(data_undersampled)

#> # A tibble: 6,754 × 2
#>    Text                                                                 oh_label
#>    <chr>                                                                <fct>   
#>  1 THISE NOISES WHILE HE IS WAGGING HIS TAIL. wtf http://t.co/YEkXYKq3… 0       
#>  2 It's Kat &amp; Andre vs Katie &amp; Nikki. In the last redemption r… 0       
#>  3 @MaxBlumenthal @yaghiadam75 http://t.co/rLWZo41yxJ                   0       
#>  4 @PhilipMills8 @PsychBarakat @jncatron And there is no Palestine and… 0       
#>  5 RT @Nayadrikj: GO AWAY PHOTOBOMB, YOU'RE CREEPY #ThanksGrandma http… 0       
#>  6 @banhammor it's not true. see last 2 tweets.                         0       
#>  7 Can't ask campers to wait for breakfast. Gotta give them what they … 0       
#>  8 @cooperq and he is defending it right now. ugh.                      0       
#>  9 Smears on a plate, for real ?  #MKR                                  0       
#> 10 If your company has an abuse, social media, or community team, we w… 0       
#> # ℹ 6,744 more rows

2 Text Cleansing

2.1 Twitter Dataset Cleansing Pattern

Menggunakan Cara pembersihan text yang umumnya digunakan pada dataset Twitter, referensi dari algotech

dollar <- rx() %>% 
  rx_find("$")
number <- rx_digit()
punctuation <- rx_punctuation()
exclamation <- rx() %>% 
  rx_find(value = "!") %>% 
  rx_one_or_more()
question <- rx() %>% 
  rx_find(value = "?") %>% 
  rx_one_or_more()
hashtag <- rx() %>% 
  rx_find(value = "#") %>% 
  rx_alnum() %>% 
  rx_one_or_more()
mention <- rx() %>% 
  rx_find(value = "@") %>% 
  rx_alnum() %>% 
  rx_one_or_more()

2.2 Twitter Clean

twitter_clean <- data_undersampled %>% 
  mutate(
    text_clean = Text %>% 
      replace_url() %>% 
      replace_emoji() %>% 
      replace_emoticon() %>% 
      replace_html() %>% 
      str_remove_all(pattern = mention) %>% 
      str_remove_all(pattern = hashtag) %>% 
      replace_contraction() %>% 
      replace_word_elongation() %>% 
      str_replace_all(pattern = question, replacement = "questionmark") %>% 
      str_replace_all(pattern = exclamation, replacement = "exclamationmark") %>% 
      str_remove_all(pattern = punctuation) %>% 
      str_remove_all(pattern = number) %>% 
      str_remove_all(pattern = dollar) %>% 
      str_to_lower() %>% 
      str_squish()
  )

twitter_clean %>% 
  select(Text, text_clean) %>% 
  sample_n(5)

#> # A tibble: 5 × 2
#>   Text                                                                text_clean
#>   <chr>                                                               <chr>     
#> 1 @gamergatetxt @Cernowatch @PlayDangerously gaters trigger too many… gaters tr…
#> 2 @TheGags29 You make many baseless assumptions and decide they are … you make …
#> 3 RT @Catherine_Sarah: THAT IS THE DESERT!!! oh gawd if they don't g… rt that i…
#> 4 Listen, you're sexist. @TommyJohn15 Listen, I'm not sexist, but ev… listen yo…
#> 5 RT @victorymonk: @MGTOWKnight @highwiregirl oh, snap!! Here is wha… rt oh sna…

data <- twitter_clean %>% 
  select(text_clean, oh_label) %>% 
  na.omit()

head(data, 10)

#> # A tibble: 10 × 2
#>    text_clean                                                           oh_label
#>    <chr>                                                                <fct>   
#>  1 "thise noises while he is wagging his tail wtf"                      0       
#>  2 "it is kat andre vs katie nikki in the last redemption round instan… 0       
#>  3 ""                                                                   0       
#>  4 "and there is no palestine and never has been a palestine"           0       
#>  5 "rt go away photobomb you are creepy"                                0       
#>  6 "it is not true see last tweets"                                     0       
#>  7 "can not ask campers to wait for breakfast gotta give them what the… 0       
#>  8 "and he is defending it right now ugh"                               0       
#>  9 "smears on a plate for real questionmark"                            0       
#> 10 "if your company has an abuse social media or community team we wan… 0

library(tm)
tdm <- TermDocumentMatrix(data$text_clean)
length(rownames(tdm))

#> [1] 8828

3 Data Preparataion For Model

num_words <- 1024 

# prepare tokenizers
tokenizer <- text_tokenizer(num_words = num_words,
                            lower = TRUE) %>% 
  fit_text_tokenizer(data$text_clean)

set.seed(100)
intrain <- initial_split(data = data, prop = 0.8, strata = "oh_label")

data_train <- training(intrain)
data_test <- testing(intrain)

set.seed(100)
inval <- initial_split(data = data_test, prop = 0.5, strata = "oh_label")

data_val <- training(inval)
data_test <- testing(inval)

maxlen <- max(str_count(data$text_clean, "\\w+")) + 1 
paste("maxiumum length words in data:", maxlen)

#> [1] "maxiumum length words in data: 33"

# prepare x
data_train_x <- texts_to_sequences(tokenizer, data_train$text_clean) %>%
  pad_sequences(maxlen = maxlen)

data_val_x <- texts_to_sequences(tokenizer, data_val$text_clean) %>%
  pad_sequences(maxlen = maxlen)

data_test_x <- texts_to_sequences(tokenizer, data_test$text_clean) %>%
  pad_sequences(maxlen = maxlen)

# prepare y
data_train_y <- to_categorical(data_train$oh_label, num_classes = 2)
data_val_y <- to_categorical(data_val$oh_label, num_classes = 2)
data_test_y <- to_categorical(data_test$oh_label, num_classes = 2)

4 Modeling

# initiate keras model sequence
model <- keras_model_sequential()

# model
model %>%
  # layer input
  layer_embedding(
    name = "input",
    input_dim = num_words,
    input_length = maxlen,
    output_dim = 32, 
    embeddings_initializer = initializer_random_uniform(minval = -0.05, maxval = 0.05, seed = 2)
  ) %>%
  # layer dropout
  layer_dropout(
    name = "embedding_dropout",
    rate = 0.5
  ) %>%
  # layer lstm 1
  layer_lstm(
    name = "lstm",
    units = 256,
    dropout = 0.2,
    recurrent_dropout = 0.2,
    return_sequences = FALSE, 
    recurrent_initializer = initializer_random_uniform(minval = -0.05, maxval = 0.05, seed = 2),
    kernel_initializer = initializer_random_uniform(minval = -0.05, maxval = 0.05, seed = 2)
  ) %>%
  # layer output
  layer_dense(
    name = "output",
    units = 2,
    activation = "softmax", 
    kernel_initializer = initializer_random_uniform(minval = -0.05, maxval = 0.05, seed = 2)
  )

# compile the model
model %>% compile(
  optimizer = "adam",
  metrics = "accuracy",
  loss = "categorical_crossentropy"
)

# model summary
summary(model)

#> Model: "sequential"
#> ________________________________________________________________________________
#> Layer (type)                        Output Shape                    Param #     
#> ================================================================================
#> input (Embedding)                   (None, 33, 32)                  32768       
#> ________________________________________________________________________________
#> embedding_dropout (Dropout)         (None, 33, 32)                  0           
#> ________________________________________________________________________________
#> lstm (LSTM)                         (None, 256)                     295936      
#> ________________________________________________________________________________
#> output (Dense)                      (None, 2)                       514         
#> ================================================================================
#> Total params: 329,218
#> Trainable params: 329,218
#> Non-trainable params: 0
#> ________________________________________________________________________________

# model fit settings
epochs <- 10
batch_size <- 512

# fit the model
history <- model %>% fit(
  data_train_x, data_train_y,
  batch_size = batch_size, 
  epochs = epochs,
  verbose = 1,
  validation_data = list(
    data_val_x, data_val_y
  )
)

# history plot
plot(history)

# predict on train
data_train_pred <- model %>%
  predict_classes(data_train_x) %>%
  as.vector()

# predict on val
data_val_pred <- model %>%
  predict_classes(data_val_x) %>%
  as.vector()

# predict on test
data_test_pred <- model %>%
  predict_classes(data_test_x) %>%
  as.vector()

5 Eval

# accuracy on data train
accuracy_vec(
 truth = factor(data_train$oh_label,labels = c("sexism", "no")),
 estimate = factor(data_train_pred, labels = c("sexism", "no"))
)

#> [1] 0.8491673

# accuracy on data test
accuracy_vec(
 truth = factor(data_test$oh_label,labels = c("sexism", "no")),
 estimate = factor(data_test_pred, labels = c("sexism", "no"))
)

#> [1] 0.7990937

Bank Costumer Segmentation

Rizki Aldiansyah

July 31, 2023