# Load the three example workbooks from the project's data/ directory and
# tag every row with a character `id` (its row position); the later joins
# between scores and source rows use this `id` as the key.
data_emoji <- mutate(
  read_excel(here::here("data/emoji_examples.xlsx")),
  id = as.character(row_number())
)
data_text <- mutate(
  read_excel(here::here("data/text_examples.xlsx")),
  id = as.character(row_number())
)
data_training <- mutate(
  read_excel(here::here("data/training_test.xlsx")),
  id = as.character(row_number())
)
# Score the `text` column of the emoji examples.
# NOTE(review): get_perspective_df() is defined outside this chunk; it
# presumably calls the Perspective API once per row -- confirm before re-running.
data_emoji_scores <- data_emoji %>%
  get_perspective_df(text_col = "text")
## # A tibble: 50 x 1
##    result$TOXICITY $SEVERE_TOXICITY $IDENTITY_ATTACK $INSULT $PROFANITY
##              <dbl>            <dbl>            <dbl>   <dbl>      <dbl>
##  1          0.334            0.120            0.122   0.320      0.179 
##  2          0.392            0.226            0.311   0.202      0.375 
##  3          0.0897           0.0324           0.0571  0.0685     0.0294
##  4          0.156            0.0812           0.0918  0.125      0.136 
##  5          0.0923           0.0383           0.109   0.0757     0.0675
##  6          0.611            0.358            0.426   0.342      0.565 
##  7          0.817            0.683            0.922   0.806      0.807 
##  8          0.945            0.842            0.858   0.933      0.927 
##  9          0.182            0.106            0.0605  0.122      0.154 
## 10          0.676            0.600            0.606   0.491      0.643 
## # ... with 40 more rows
# Score the `text` column of the plain-text examples.
# NOTE(review): get_perspective_df() is defined outside this chunk; it
# presumably calls the Perspective API once per row -- confirm before re-running.
data_text_scores <- data_text %>%
  get_perspective_df(text_col = "text")
## # A tibble: 50 x 1
##    result$TOXICITY $SEVERE_TOXICITY $IDENTITY_ATTACK $INSULT $PROFANITY
##              <dbl>            <dbl>            <dbl>   <dbl>      <dbl>
##  1           0.852           0.793             0.943   0.848     0.784 
##  2           0.445           0.196             0.391   0.532     0.104 
##  3           0.155           0.0714            0.109   0.142     0.0687
##  4           0.311           0.139             0.137   0.340     0.0920
##  5           0.296           0.183             0.437   0.246     0.140 
##  6           0.897           0.806             0.954   0.877     0.816 
##  7           0.744           0.637             0.879   0.767     0.680 
##  8           0.769           0.680             0.889   0.766     0.693 
##  9           0.767           0.668             0.877   0.787     0.805 
## 10           0.975           0.907             0.968   0.947     0.975 
## # ... with 40 more rows
# Score the `text` column of the training/test examples.
# NOTE(review): get_perspective_df() is defined outside this chunk; it
# presumably calls the Perspective API once per row -- confirm before re-running.
data_training_scores <- data_training %>%
  get_perspective_df(text_col = "text")
## # A tibble: 100 x 1
##    result$TOXICITY $SEVERE_TOXICITY $IDENTITY_ATTACK $INSULT $PROFANITY
##              <dbl>            <dbl>            <dbl>   <dbl>      <dbl>
##  1          0.0897           0.0324           0.0571  0.0685     0.0294
##  2          0.476            0.418            0.399   0.307      0.509 
##  3          0.150            0.0689           0.0523  0.0759     0.134 
##  4          0.445            0.196            0.391   0.532      0.104 
##  5          0.769            0.680            0.889   0.766      0.693 
##  6          0.436            0.185            0.178   0.474      0.169 
##  7          0.916            0.842            0.558   0.891      0.927 
##  8          0.117            0.0351           0.0840  0.0991     0.0517
##  9          0.933            0.939            0.922   0.909      0.931 
## 10          0.409            0.190            0.185   0.453      0.317 
## # ... with 90 more rows
# Auto-print the scored training data for a quick visual check.
data_training_scores
#' Box plots of attribute scores, one box per attribute and class
#'
#' Joins `scores` to `examples` on `id`, drops the text and id columns,
#' reshapes every remaining column except `class` into long form
#' (`attribute`, `score`) and draws box plots filled by `class`.
#'
#' @param scores Data frame of scored rows. Must share `id` and `text`
#'   columns with `examples` (the join is expected to yield `text.x` and
#'   `text.y`).
#' @param examples Data frame of source rows to join onto `scores`.
#' @param title Plot title.
#' @return A ggplot object; it is drawn when auto-printed at top level.
plot_score_boxplots <- function(scores, examples, title) {
  scores %>%
    left_join(examples, by = "id") %>%
    select(-text.x, -text.y, -id) %>%
    # Everything that is left besides `class` is treated as a score column
    pivot_longer(!class, names_to = "attribute", values_to = "score") %>%
    ggplot(aes(y = score, x = attribute)) +
    geom_boxplot(aes(fill = as.factor(class))) +
    theme_minimal() +
    # Rotate the attribute labels so they do not overlap
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    labs(title = title, fill = "class")
}

plot_score_boxplots(
  data_emoji_scores, data_emoji,
  "Emoji Dataset Perspective Scores boxplots"
)

plot_score_boxplots(
  data_text_scores, data_text,
  "Text Dataset Perspective Scores boxplots"
)

plot_score_boxplots(
  data_training_scores, data_training,
  "Training Dataset Perspective Scores boxplots"
)

# Export the scored training rows, re-joined to their source rows by `id`
# (the duplicate `text.y` column is dropped), as an Excel-friendly CSV.
write_excel_csv(
  select(
    left_join(data_training_scores, data_training, by = "id"),
    -text.y
  ),
  here::here("data/training_data_perspective_scored.csv")
)

# Experimental Ensemble Model ----

# Initial train/test split, stratified on `class`.
# The seed is fixed so the 75/25 partition is reproducible.
set.seed(1353)
training_scores_split <- data_training_scores %>%
  left_join(data_training, by = "id") %>%
  select(-text.y, -id) %>%
  # `class` becomes a factor; the model below is fit in classification mode
  mutate(class = as.factor(class)) %>%
  # Trailing comma removed: it passed a stray empty argument to initial_split()
  initial_split(prop = 0.75, strata = class)

# Training partition of the split
train <- training(training_scores_split)

# Held-out testing partition of the split
test <- testing(training_scores_split)

# Row counts of the two partitions
nrow(train)
## [1] 76
nrow(test)
## [1] 24
# Logistic-regression specification: glm engine, classification mode
logit_spec <- set_mode(set_engine(logistic_reg(), "glm"), "classification")

# Fit `class` on every remaining column of the training partition; the raw
# text column is removed first so it is not used as a predictor.
# NOTE(review): the name `lm` masks stats::lm() for the rest of the session;
# it is kept here because the prediction code refers to it.
lm <- fit(logit_spec, class ~ ., data = select(train, -text.x))
# Hard class predictions on the held-out partition
pred_class <- predict(lm, new_data = test, type = "class")

# Per-class predicted probabilities on the held-out partition
pred_proba <- predict(lm, new_data = test, type = "prob")

# Text and true class side by side with both kinds of prediction
results <- bind_cols(
  select(test, text.x, class),
  pred_class,
  pred_proba
)
# Confusion matrix of predicted vs. true class on the held-out partition
results %>%
  conf_mat(truth = class, estimate = .pred_class)
##           Truth
## Prediction  0  1
##          0 10  6
##          1  2  6

# Summary classification metrics.
# NOTE(review): yardstick's default `event_level = "first"` makes class `0`
# the event for precision, recall and F-measure. If class `1` is the class of
# interest, pass `event_level = "second"` -- confirm the intended positive class.
results %>%
  accuracy(truth = class, estimate = .pred_class)
results %>%
  precision(truth = class, estimate = .pred_class)
results %>%
  recall(truth = class, estimate = .pred_class)
results %>%
  f_meas(truth = class, estimate = .pred_class)
results %>%
  mcc(truth = class, estimate = .pred_class)