# Load the three example workbooks. Each table gets a character `id` holding
# the row position; the score tables are joined back to it on this key.
data_emoji <- here::here("data/emoji_examples.xlsx") %>%
  read_excel() %>%
  mutate(id = as.character(row_number()))

data_text <- here::here("data/text_examples.xlsx") %>%
  read_excel() %>%
  mutate(id = as.character(row_number()))

data_training <- here::here("data/training_test.xlsx") %>%
  read_excel() %>%
  mutate(id = as.character(row_number()))
# Score the emoji examples. The result printed below lists TOXICITY,
# SEVERE_TOXICITY, IDENTITY_ATTACK, INSULT and PROFANITY columns.
# NOTE(review): get_perspective_df() is defined outside this section;
# presumably it scores each value of the `text` column -- confirm.
data_emoji_scores <- get_perspective_df(data_emoji, text_col = "text")
## # A tibble: 50 x 1
## result$TOXICITY $SEVERE_TOXICITY $IDENTITY_ATTACK $INSULT $PROFANITY
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.334 0.120 0.122 0.320 0.179
## 2 0.392 0.226 0.311 0.202 0.375
## 3 0.0897 0.0324 0.0571 0.0685 0.0294
## 4 0.156 0.0812 0.0918 0.125 0.136
## 5 0.0923 0.0383 0.109 0.0757 0.0675
## 6 0.611 0.358 0.426 0.342 0.565
## 7 0.817 0.683 0.922 0.806 0.807
## 8 0.945 0.842 0.858 0.933 0.927
## 9 0.182 0.106 0.0605 0.122 0.154
## 10 0.676 0.600 0.606 0.491 0.643
## # ... with 40 more rows
# Score the plain-text examples. The result printed below lists TOXICITY,
# SEVERE_TOXICITY, IDENTITY_ATTACK, INSULT and PROFANITY columns.
# NOTE(review): get_perspective_df() is defined outside this section;
# presumably it scores each value of the `text` column -- confirm.
data_text_scores <- get_perspective_df(data_text, text_col = "text")
## # A tibble: 50 x 1
## result$TOXICITY $SEVERE_TOXICITY $IDENTITY_ATTACK $INSULT $PROFANITY
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.852 0.793 0.943 0.848 0.784
## 2 0.445 0.196 0.391 0.532 0.104
## 3 0.155 0.0714 0.109 0.142 0.0687
## 4 0.311 0.139 0.137 0.340 0.0920
## 5 0.296 0.183 0.437 0.246 0.140
## 6 0.897 0.806 0.954 0.877 0.816
## 7 0.744 0.637 0.879 0.767 0.680
## 8 0.769 0.680 0.889 0.766 0.693
## 9 0.767 0.668 0.877 0.787 0.805
## 10 0.975 0.907 0.968 0.947 0.975
## # ... with 40 more rows
# Score the labelled training examples. The result printed below lists
# TOXICITY, SEVERE_TOXICITY, IDENTITY_ATTACK, INSULT and PROFANITY columns.
# NOTE(review): get_perspective_df() is defined outside this section;
# presumably it scores each value of the `text` column -- confirm.
data_training_scores <- get_perspective_df(data_training, text_col = "text")
## # A tibble: 100 x 1
## result$TOXICITY $SEVERE_TOXICITY $IDENTITY_ATTACK $INSULT $PROFANITY
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0897 0.0324 0.0571 0.0685 0.0294
## 2 0.476 0.418 0.399 0.307 0.509
## 3 0.150 0.0689 0.0523 0.0759 0.134
## 4 0.445 0.196 0.391 0.532 0.104
## 5 0.769 0.680 0.889 0.766 0.693
## 6 0.436 0.185 0.178 0.474 0.169
## 7 0.916 0.842 0.558 0.891 0.927
## 8 0.117 0.0351 0.0840 0.0991 0.0517
## 9 0.933 0.939 0.922 0.909 0.931
## 10 0.409 0.190 0.185 0.453 0.317
## # ... with 90 more rows
# Display the scored training table (top-level auto-print).
data_training_scores
# Draw one boxplot per score attribute, filled by class.
#
# scores:   score table; must carry `id` and `text` plus the score columns.
# examples: matching example table; must carry `id`, `text` and `class`.
# title:    plot title.
#
# Returns the ggplot object, so a top-level call still renders the plot.
plot_score_boxplots <- function(scores, examples, title) {
  scores %>%
    left_join(examples, by = "id") %>%
    # Both inputs have a `text` column, so the join produces text.x / text.y.
    # Drop them and the key so only `class` and the score columns remain.
    select(-text.x, -text.y, -id) %>%
    pivot_longer(!class, names_to = "attribute", values_to = "score") %>%
    ggplot(aes(x = attribute, y = score)) +
    geom_boxplot(aes(fill = as.factor(class))) +
    theme_minimal() +
    # Rotate the attribute labels so the long names do not overlap.
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    labs(title = title, fill = "class")
}

plot_score_boxplots(
  data_emoji_scores, data_emoji,
  "Emoji Dataset Perspective Scores boxplots"
)

plot_score_boxplots(
  data_text_scores, data_text,
  "Text Dataset Perspective Scores boxplots"
)

plot_score_boxplots(
  data_training_scores, data_training,
  "Training Dataset Perspective Scores boxplots"
)

# Join the labels onto the training scores and save the result as CSV.
# text.y is the duplicate `text` column introduced by the join.
write_excel_csv(
  data_training_scores %>%
    left_join(data_training, by = "id") %>%
    select(-text.y),
  here::here("data/training_data_perspective_scored.csv")
)
# Experimental Ensemble Model ----
# Initial split: 75% train / 25% test, stratified on class.
# The seed is fixed so the same rows land in each partition on every run.
set.seed(1353)
training_scores_split <- data_training_scores %>%
  left_join(data_training, by = "id") %>%
  # Drop the duplicate text column from the join and the join key, so the
  # key is not picked up as a predictor by the `class ~ .` fit.
  select(-text.y, -id) %>%
  # The label becomes a factor; it is the outcome of the classification fit.
  mutate(class = as.factor(class)) %>%
  # No trailing comma after the last argument: it would pass an extra,
  # empty argument to initial_split().
  initial_split(prop = 0.75, strata = class)

# Create training data
train <- training(training_scores_split)
# Create testing data
test <- testing(training_scores_split)

# Number of rows in train and test dataset
nrow(train)
## [1] 76
nrow(test)
## [1] 24
# Logistic regression (glm engine) of class on every remaining column of the
# training partition; the raw text column is excluded from the predictors.
# NOTE(review): the name `lm` masks stats::lm() for the rest of the session.
# It is kept because code outside this section may refer to it.
lm <- logistic_reg(mode = "classification") %>%
  set_engine("glm") %>%
  fit(class ~ ., data = select(train, -text.x))

# Hard class labels for the held-out rows
pred_class <- lm %>%
  predict(new_data = test, type = "class")

# Class probabilities for the held-out rows
pred_proba <- lm %>%
  predict(new_data = test, type = "prob")
# Put the true label and source text next to the predicted class and the
# predicted probabilities, one row per held-out example.
results <- bind_cols(
  select(test, text.x, class),
  pred_class,
  pred_proba
)

# Confusion matrix of predicted vs. true class
results %>%
  conf_mat(truth = class, estimate = .pred_class)
## Truth
## Prediction 0 1
## 0 10 6
## 1 2 6
# Held-out classification metrics, each printed as its own tibble.
# NOTE(review): yardstick's default is event_level = "first", so precision,
# recall and f_meas below treat class 0 (the first factor level) as the
# event. If class 1 is the class of interest, pass event_level = "second".
# Confirm against the labelling scheme.
results %>% accuracy(truth = class, estimate = .pred_class)
results %>% precision(truth = class, estimate = .pred_class)
results %>% recall(truth = class, estimate = .pred_class)
results %>% f_meas(truth = class, estimate = .pred_class)
results %>% mcc(truth = class, estimate = .pred_class)