knitr::include_graphics("cyberbullying.jpg")As social media usage becomes increasingly prevalent in every age group, a vast majority of citizens rely on this essential medium for day-to-day communication. Social media’s ubiquity means that cyberbullying can effectively impact anyone at any time or anywhere, and the relative anonymity of the internet makes such personal attacks more difficult to stop than traditional bullying.
On April 15th, 2020, UNICEF issued a warning in response to the increased risk of cyberbullying during the COVID-19 pandemic due to widespread school closures, increased screen time, and decreased face-to-face social interaction. The statistics of cyberbullying are outright alarming: 36.5% of middle and high school students have felt cyberbullied and 87% have observed cyberbullying, with effects ranging from decreased academic performance to depression to suicidal thoughts.
In light of all of this, this dataset contains more than 47000 tweets labelled according to the class of cyberbullying:
Age
Ethnicity
Gender
Religion
Other type of cyberbullying
Not cyberbullying
The data has been balanced in order to contain ~8000 of each class.
bully <- read.csv("cyberbullying_tweets.csv")str(bully)## 'data.frame': 47692 obs. of 2 variables:
## $ tweet_text : chr "In other words #katandandre, your food was crapilicious! #mkr" "Why is #aussietv so white? #MKR #theblock #ImACelebrityAU #today #sunrise #studio10 #Neighbours #WonderlandTen #etc" "@XochitlSuckkks a classy whore? Or more red velvet cupcakes?" "@Jason_Gio meh. :P thanks for the heads up, but not too concerned about another angry dude on twitter." ...
## $ cyberbullying_type: chr "not_cyberbullying" "not_cyberbullying" "not_cyberbullying" "not_cyberbullying" ...
bully_clean <- bully %>%
mutate(cyberbullying_type = as.factor(cyberbullying_type))The dataset consist of 2 variables and 47692 observations. The description of the variables are :
tweet_text : tweet message
cyberbullying_type : class of cyberbullying
Only choosing alphanumeric, number and space character in the tweet_text, and replace the non ASCII character.
bully_clean$tweet_text <- gsub("[^[:alnum:][:space:]]","", x = bully_clean$tweet_text)
bully_clean$tweet_text <- gsub("[][à ªâ]","", x = bully_clean$tweet_text)
bully_clean$tweet_text <- textclean::replace_non_ascii(bully_clean$tweet_text)Slicing the dataset by the cyberbullying_type.
bage <- bully_clean %>%
filter(cyberbullying_type == "age")
bethnic <- bully_clean %>%
filter(cyberbullying_type == "ethnicity")
bgender <- bully_clean %>%
filter(cyberbullying_type == "gender")
bother <- bully_clean %>%
filter(cyberbullying_type == "other_cyberbullying")
breligion <- bully_clean %>%
filter(cyberbullying_type == "religion")Convert to corpus.
bully_corpus <- VCorpus(VectorSource(bully_clean$tweet_text))
bage_corpus <- VCorpus(VectorSource(bage$tweet_text))
bethnic_corpus <- VCorpus(VectorSource(bethnic$tweet_text))
bgender_corpus <- VCorpus(VectorSource(bgender$tweet_text))
bother_corpus <- VCorpus(VectorSource(bother$tweet_text))
breligion_corpus <- VCorpus(VectorSource(breligion$tweet_text))For the full dataset, convert the corpus to lowercase, remove numbers, remove stopwords, remove punctuation, stem document, and lastly strip whitespace. Save to RDS, as this is a long processing time.
# bully_corpus_clean <- bully_corpus %>%
# tm_map(content_transformer(tolower)) %>%
# tm_map(removeNumbers) %>%
# tm_map(removeWords, stopwords("english")) %>%
# tm_map(removePunctuation) %>%
# tm_map(stemDocument) %>%
# tm_map(stripWhitespace)
# saveRDS(bully_corpus_clean, file = "bully_corpus_clean.RDS")
bully_corpus_clean <- readRDS("bully_corpus_clean.RDS")Same process with the sliced dataset, except using lemmatize insted of stem document. Saved to RDS.
# bage_corpus_clean <- bage_corpus %>%
# tm_map(content_transformer(tolower)) %>%
# tm_map(removeNumbers) %>%
# tm_map(removeWords, stopwords("english")) %>%
# tm_map(lemmatize_strings) %>%
# tm_map(PlainTextDocument) %>%
# tm_map(stripWhitespace)
#
# saveRDS(bage_corpus_clean, file = "bage_corpus_clean.RDS")
bage_corpus_clean <- readRDS("bage_corpus_clean.RDS")# bethnic_corpus_clean <- bethnic_corpus %>%
# tm_map(content_transformer(tolower)) %>%
# tm_map(removeNumbers) %>%
# tm_map(removeWords, stopwords("english")) %>%
# tm_map(lemmatize_strings) %>%
# tm_map(PlainTextDocument) %>%
# tm_map(stripWhitespace)
#
# saveRDS(bethnic_corpus_clean, file = "bethnic_corpus_clean.RDS")
bethnic_corpus_clean <- readRDS("bethnic_corpus_clean.RDS")# bgender_corpus_clean <- bgender_corpus %>%
# tm_map(content_transformer(tolower)) %>%
# tm_map(removeNumbers) %>%
# tm_map(removeWords, stopwords("english")) %>%
# tm_map(lemmatize_strings) %>%
# tm_map(PlainTextDocument) %>%
# tm_map(stripWhitespace)
#
# saveRDS(bgender_corpus_clean, file = "bgender_corpus_clean.RDS")
bgender_corpus_clean <- readRDS("bgender_corpus_clean.RDS")# bother_corpus_clean <- bother_corpus %>%
# tm_map(content_transformer(tolower)) %>%
# tm_map(removeNumbers) %>%
# tm_map(removeWords, stopwords("english")) %>%
# tm_map(lemmatize_strings) %>%
# tm_map(PlainTextDocument) %>%
# tm_map(stripWhitespace)
#
# saveRDS(bother_corpus_clean, file = "bother_corpus_clean.RDS")
bother_corpus_clean <- readRDS("bother_corpus_clean.RDS")# breligion_corpus_clean <- breligion_corpus %>%
# tm_map(content_transformer(tolower)) %>%
# tm_map(removeNumbers) %>%
# tm_map(removeWords, stopwords("english")) %>%
# tm_map(lemmatize_strings) %>%
# tm_map(PlainTextDocument) %>%
# tm_map(stripWhitespace)
#
# saveRDS(breligion_corpus_clean, file = "breligion_corpus_clean.RDS")
breligion_corpus_clean <- readRDS("breligion_corpus_clean.RDS")Random inspect the real data, corpus, and processed corpus
bully_clean[19430,]$cyberbullying_type## [1] religion
## 6 Levels: age ethnicity gender not_cyberbullying ... religion
bully_corpus[[19430]]$content## [1] "Lets well all stand tall Let us we all United against a Radical Islamic terrorism Time to united not against terrorism but also against those insiders who supports them nd funds them KapilMishraIND we r with u HinduUnitedAgainstTerror"
bully_corpus_clean[[19430]]$content## [1] "let well stand tall let us unit radic islam terror time unit terror also insid support nd fund kapilmishraind r u hinduunitedagainstterror"
bage[310,]$cyberbullying_type## [1] age
## 6 Levels: age ethnicity gender not_cyberbullying ... religion
bage_corpus[[310]]$content## [1] "This was the line that struck me too We have truly been abandoned in the sickest and most egregious manner And now they expect to bully us into returning to a normal school opening"
bage_corpus_clean[310]$content## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 93
inspect(bage_corpus_clean[[310]])## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 93
##
## line strike truly abandon sick egregious manner now expect bully us return normal school open
Convert Corpus to DocumentTermMatrix, convert again to dataframe, pivot longer to find the most used words of each class of cyberbullying. Again, save the output into RDS, as this is a long process. Lastly, visualize in word cloud.
# bage_dtm <- DocumentTermMatrix(bage_corpus_clean)
#
# bage_count <- as.data.frame(as.matrix(bage_dtm))
#
# bage_count_long <- pivot_longer(data = bage_count, cols = everything())
#
# final_bage <- bage_count_long %>%
# group_by(name) %>%
# summarise(tot = sum(value))
#saveRDS(final_bage, file = "final_bage.RDS")
final_bage <- readRDS("final_bage.RDS")
bage_cloud <- final_bage %>%
filter(tot >= 50) %>%
arrange(desc(tot))
head(bage_cloud)Most used words in Age Cyberbullying are, bully, school , high, girl, like, get.
wordcloud2(data=bage_cloud, size=4,color = "random-light", backgroundColor = "black")# bethnic_dtm <- DocumentTermMatrix(bethnic_corpus_clean)
#
# bethnic_count <- as.data.frame(as.matrix(bethnic_dtm))
#
# bethnic_count_long <- pivot_longer(data = bethnic_count, cols = everything())
#
# final_bethnic <- bethnic_count_long %>%
# group_by(name) %>%
# summarise(tot = sum(value))
#
# saveRDS(final_bethnic, file = "final_bethnic.RDS")
final_bethnic <- readRDS("final_bethnic.RDS")
bethnic_cloud <- final_bethnic %>%
filter(tot >= 50) %>%
arrange(desc(tot))
head(bethnic_cloud,10)In ethnic cyberbullying, black people are the targeted bully. While Obama is the most cyberbullied individual.
wordcloud2(data=bethnic_cloud, size = 3,color = "random-light", backgroundColor = "black")# bgender_dtm <- DocumentTermMatrix(bgender_corpus_clean)
#
# bgender_count <- as.data.frame(as.matrix(bgender_dtm))
#
# bgender_count_long <- pivot_longer(data = bgender_count, cols = everything())
#
# final_bgender <- bgender_count_long %>%
# group_by(name) %>%
# summarise(tot = sum(value))
#
# saveRDS(final_bgender, file = "final_bgender.RDS")
final_bgender <- readRDS("final_bgender.RDS")
bgender_cloud <- final_bgender %>%
filter(tot >= 50) %>%
arrange(desc(tot))
head(bgender_cloud)In gender class cyberbullying, gay people, and woman are the victim.
wordcloud2(data=bgender_cloud, size=3,color = "random-light", backgroundColor = "black")# breligion_dtm <- DocumentTermMatrix(breligion_corpus_clean)
#
# breligion_count <- as.data.frame(as.matrix(breligion_dtm))
#
# breligion_count_long <- pivot_longer(data = breligion_count, cols = everything())
#
# final_breligion <- breligion_count_long %>%
# group_by(name) %>%
# summarise(tot = sum(value))
#
# saveRDS(final_breligion, file = "final_breligion.RDS")
final_breligion <- readRDS("final_breligion.RDS")
breligion_cloud <- final_breligion %>%
filter(tot >= 50) %>%
arrange(desc(tot))
head(breligion_cloud)Most used words in Religion Cyberbullying are, muslim, idiot , christian, terrorist.
wordcloud2(data=breligion_cloud, size=2,color = "random-light", backgroundColor = "black")# bother_dtm <- DocumentTermMatrix(bother_corpus_clean)
#
# bother_count <- as.data.frame(as.matrix(bother_dtm))
#
# bother_count_long <- pivot_longer(data = bother_count, cols = everything())
#
# final_bother <- bother_count_long %>%
# group_by(name) %>%
# summarise(tot = sum(value))
#
# saveRDS(final_bother, file = "final_bother.RDS")
final_bother <- readRDS("final_bother.RDS")
bother_cloud <- final_bother %>%
filter(tot >= 50) %>%
arrange(desc(tot))
head(bother_cloud)In Other Class of Cyberbullying, bully is the most used word.
wordcloud2(data=bother_cloud, size=1.6,color = "random-light", backgroundColor = "black")Convert full dataset into DocumentTermMatrix, save to RDS.
# bully_dtm <- DocumentTermMatrix(bully_corpus_clean)
# saveRDS(bully_dtm, file = "bully_dtm.RDS")
bully_dtm <- readRDS("bully_dtm.RDS")
inspect(bully_dtm)## <<DocumentTermMatrix (documents: 47692, terms: 53132)>>
## Non-/sparse entries: 595834/2533375510
## Sparsity : 100%
## Maximal term length: 315
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs bulli dumb fuck girl high joke like nigger peopl school
## 10923 0 0 2 0 0 7 1 0 0 0
## 1318 1 0 0 0 0 0 1 0 0 2
## 15622 0 0 1 0 0 9 1 0 3 0
## 24517 9 0 4 0 0 0 8 0 3 1
## 29206 10 0 4 1 0 0 6 0 3 0
## 30753 5 0 1 0 0 0 4 0 2 0
## 33725 8 0 0 3 3 0 3 0 1 8
## 44036 0 9 14 0 0 0 3 10 2 0
## 45166 0 5 6 0 0 0 0 6 2 0
## 47038 0 5 5 0 0 0 2 6 0 0
Split the dataset into train and test dataset with 75% - 25% proportion.
All Class of Cyberbullying are balance in the train dataset.
btrain_type %>%
table() %>%
prop.table()## .
## age ethnicity gender not_cyberbullying
## 0.1685538 0.1656183 0.1670720 0.1671839
## other_cyberbullying religion
## 0.1649473 0.1666247
There are 53.132 terms in the train dataset, we will find the most used terms with minimum frequency 100.
dim(btrain)## [1] 35769 53132
bully_freq <- findFreqTerms(x = btrain, lowfreq = 100)After subset, only 703 terms left.
btrain <- btrain[,bully_freq]
dim(btrain)## [1] 35769 703
The value of the term in btrain and btest in are in frequency. In order to count probability, we need to convert frequency into 1 or 0. A simple Bernoulli Converter will serve it.
bernoulli_conv <- function(x){
x <- as.factor(ifelse(x > 0, 1, 0))
return(x)
}Convert, save into RDS. This convertion part took 22 GB of RAM. And this Rmd was writen in ALL FLASH disk.
# btrain_bn <- apply(btrain, MARGIN = 2, FUN = bernoulli_conv)
# saveRDS(btrain_bn, file = "btrain_bn.RDS")
btrain_bn <- readRDS("btrain_bn.RDS")
# btest_bn <- apply(btest, MARGIN = 2, FUN = bernoulli_conv)
# saveRDS(btest_bn, file = "btest_bn.RDS")
# btest_bn <- readRDS("btest_bn.RDS")Create a Naive Bayes model with the train dataset.
bmodel_NB <- naiveBayes(x = btrain_bn, y = btrain_type, laplace = 1)Predict the model, save into RDS.
# bpred_NB <- predict(object = bmodel_NB, newdata = btest_bn, type = "class")
# saveRDS(bpred_NB, file = "bpred_NB.RDS")
bpred_NB <- readRDS("bpred_NB.RDS")Model evaluation with Confusion Matrix, model accuracy is at 77.7%. This is due to mis-predict on class not_cyberbullying and other_cyberbullying. Also the Precision (Pos Pred Value), on these class are at 49.5% and 53.5%. But on the other 4 class the Precision is very high :
class age 95.9%
class ethnicity 96.4%
class gender 91.5%
class religion 90.8%
conf_NB <- confusionMatrix(data = bpred_NB,
reference = btest_type)
conf_NB## Confusion Matrix and Statistics
##
## Reference
## Prediction age ethnicity gender not_cyberbullying
## age 1879 9 4 37
## ethnicity 8 1838 20 12
## gender 1 13 1406 46
## not_cyberbullying 60 78 279 1224
## other_cyberbullying 12 38 275 580
## religion 3 61 13 66
## Reference
## Prediction other_cyberbullying religion
## age 30 1
## ethnicity 23 5
## gender 58 13
## not_cyberbullying 690 140
## other_cyberbullying 1078 32
## religion 44 1847
##
## Overall Statistics
##
## Accuracy : 0.7777
## 95% CI : (0.7701, 0.7851)
## No Information Rate : 0.1709
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7332
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: age Class: ethnicity Class: gender
## Sensitivity 0.9572 0.9023 0.7041
## Specificity 0.9919 0.9931 0.9868
## Pos Pred Value 0.9587 0.9643 0.9148
## Neg Pred Value 0.9916 0.9801 0.9431
## Prevalence 0.1646 0.1708 0.1675
## Detection Rate 0.1576 0.1542 0.1179
## Detection Prevalence 0.1644 0.1599 0.1289
## Balanced Accuracy 0.9745 0.9477 0.8454
## Class: not_cyberbullying Class: other_cyberbullying
## Sensitivity 0.6229 0.56058
## Specificity 0.8748 0.90630
## Pos Pred Value 0.4953 0.53499
## Neg Pred Value 0.9216 0.91472
## Prevalence 0.1648 0.16128
## Detection Rate 0.1027 0.09041
## Detection Prevalence 0.2072 0.16900
## Balanced Accuracy 0.7488 0.73344
## Class: religion
## Sensitivity 0.9063
## Specificity 0.9811
## Pos Pred Value 0.9081
## Neg Pred Value 0.9807
## Prevalence 0.1709
## Detection Rate 0.1549
## Detection Prevalence 0.1706
## Balanced Accuracy 0.9437
Receiver-Operating Curve (ROC). Plot True Positive Rate (Sensitivity/Recall) with False Positive Rate (1-Specificity). All resulting class plot shows ┌── , which is a good curve.
# bpred_NBprob <- predict(object = bmodel_NB, newdata = btest_bn, type = "raw")
# saveRDS(bpred_NBprob, file = "bpred_NBprob.RDS")
bpred_NBprob <- readRDS("bpred_NBprob.RDS")
bpred_NB_roc_not <- prediction(predictions = bpred_NBprob[,4],
labels = as.numeric(btest_type == "not_cyberbullying"))
bpred_NB_roc_age <- prediction(predictions = bpred_NBprob[,1],
labels = as.numeric(btest_type == "age"))
bpred_NB_roc_ethnic <- prediction(predictions = bpred_NBprob[,2],
labels = as.numeric(btest_type == "ethnicity"))
bpred_NB_roc_gender <- prediction(predictions = bpred_NBprob[,3],
labels = as.numeric(btest_type == "gender"))
bpred_NB_roc_other <- prediction(predictions = bpred_NBprob[,5],
labels = as.numeric(btest_type == "other_cyberbullying"))
bpred_NB_roc_religion <- prediction(predictions = bpred_NBprob[,6],
labels = as.numeric(btest_type == "religion"))plot(performance(prediction.obj = bpred_NB_roc_not,
measure = "tpr",
x.measure = "fpr"), col = "red", lwd = 2)
plot(performance(prediction.obj = bpred_NB_roc_age,
measure = "tpr",
x.measure = "fpr"), add = TRUE, col = "blue", lwd = 2)
plot(performance(prediction.obj = bpred_NB_roc_ethnic,
measure = "tpr",
x.measure = "fpr"), add = TRUE, col = "green", lwd = 2)
plot(performance(prediction.obj = bpred_NB_roc_gender,
measure = "tpr",
x.measure = "fpr"), add = TRUE, col = "yellow", lwd = 2)
plot(performance(prediction.obj = bpred_NB_roc_other,
measure = "tpr",
x.measure = "fpr"), add = TRUE, col = "purple", lwd = 2)
plot(performance(prediction.obj = bpred_NB_roc_religion,
measure = "tpr",
x.measure = "fpr"), add = TRUE, col = "orange", lwd = 2)
legend(0.7, 0.55, legend=c("Not Bully", "Age", "Ethnic","Gender","Other","Religion"),
col=c("red", "blue", "green","yellow","purple","orange"), lty = 1, cex = 1.2, box.lty = 0)AUC ranges in value from 0 to 1. A model whose predictions are 100% wrong has an AUC of 0.0; one whose predictions are 100% correct has an AUC of 1.0. The AUC for :
class age 0.9969
class ethnicity 0.9936
class gender 0.9562
class not_cyberbullying 0.8897
class other_cyberbullying 0.9077
class religion 0.9901
All class AUC value > 0.5 and close to 1, indicating this is a good model.
bpred_NB_auc_not <- performance(prediction.obj = bpred_NB_roc_not, measure = "auc")
bpred_NB_auc_age <- performance(prediction.obj = bpred_NB_roc_age, measure = "auc")
bpred_NB_auc_ethnic <- performance(prediction.obj = bpred_NB_roc_ethnic, measure = "auc")
bpred_NB_auc_gender <- performance(prediction.obj = bpred_NB_roc_gender, measure = "auc")
bpred_NB_auc_other <- performance(prediction.obj = bpred_NB_roc_other, measure = "auc")
bpred_NB_auc_religion <- performance(prediction.obj = bpred_NB_roc_religion, measure = "auc")
df_auc <- as.data.frame(c("Not Bully", bpred_NB_auc_not@y.values))
names(df_auc)[1] <- "Bully_Type"
names(df_auc)[2] <- "AUC"
df_auc <- rbind(df_auc, c("Age", bpred_NB_auc_age@y.values))
df_auc <- rbind(df_auc, c("Ethnic", bpred_NB_auc_ethnic@y.values))
df_auc <- rbind(df_auc, c("Gender", bpred_NB_auc_gender@y.values))
df_auc <- rbind(df_auc, c("Other", bpred_NB_auc_other@y.values))
df_auc <- rbind(df_auc, c("Religion", bpred_NB_auc_religion@y.values))
ggplot(df_auc, aes(x = reorder(Bully_Type, AUC), y = AUC, label = round(AUC,4))) +
geom_col(aes(fill = AUC), show.legend = F) +
geom_text(vjust = 1) +
scale_fill_gradient(low = "orange", high = "red") +
labs(x = "Bully Type",
title = "AUC by Bully Type") +
theme_light()With minimum frequency Term at 100, this Naive Bayes model is doing a great job predicting cyberbullying class age, ethnicity, religion, and gender. To tune the model, we can set the frequency Term to a higher number.