Spam messages are a modern nuisance. This report compares spam and ham (legitimate) SMS messages to uncover meaningful differences between them, using TF-IDF, sentiment analysis (Bing and NRC lexicons), and bi-gram analysis.
# Read the raw SMS dataset from disk; the column-type summary is suppressed
full_data <- readr::read_csv(file = "spam.csv", show_col_types = FALSE)
## New names:
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
# Keep only the two columns used downstream and give them descriptive names:
# v1 becomes `label`, v2 becomes `message`
sms_data <- full_data %>%
  select(label = v1, message = v2)
# Break every message into one-word-per-row tokens (column `word`), then
# discard any token that appears in the `stop_words` table
data <- anti_join(
  unnest_tokens(sms_data, output = word, input = message),
  stop_words,
  by = "word"
)
# Count how often each word occurs within each label
word_counts <- data %>%
  count(label, word, sort = TRUE)

# Score words by TF-IDF, treating each label as one "document"
tf_counts <- word_counts %>%
  bind_tf_idf(word, label, n)

# Keep the highest-scoring words per label (at most 10 rows each, ties cut).
# The grouping is dropped afterwards so later steps receive a plain,
# ungrouped tibble instead of silently inheriting `group_by(label)`.
top10 <- tf_counts %>%
  group_by(label) %>%
  slice_max(tf_idf, n = 10, with_ties = FALSE) %>%
  ungroup()
# Fix the facet order: ham first, spam second
top10[["label"]] <- factor(top10[["label"]], levels = c("ham", "spam"))

# Horizontal bar chart of the top TF-IDF words, one facet per label.
# reorder_within() / scale_x_reordered() sort the bars inside each facet.
ggplot(
  data = top10,
  mapping = aes(
    x = reorder_within(word, tf_idf, label),
    y = tf_idf,
    fill = label
  )
) +
  geom_col(show.legend = FALSE) +
  scale_x_reordered() +
  scale_fill_manual(values = c("ham" = "#4DB6AC", "spam" = "#FF8A65")) +
  coord_flip() +
  facet_wrap(~ label, scales = "free") +
  labs(
    title = "Top 10 Words by TF-IDF",
    x = NULL,
    y = "TF-IDF Score"
  ) +
  theme_minimal()
# Bing lexicon: one sentiment label per word
bing <- get_sentiments("bing")

# Keep only tokens that appear in the Bing lexicon and attach their
# sentiment. The join key is stated explicitly so dplyr does not have to
# guess it (and does not emit a "Joining with" message).
sentiment_data <- data %>%
  inner_join(bing, by = "word")
# Per-label frequency of each (word, sentiment) pair, most frequent first
sentiment_ham <- sentiment_data %>%
  filter(label == "ham") %>%
  count(word, sentiment, sort = TRUE)

sentiment_spam <- sentiment_data %>%
  filter(label == "spam") %>%
  count(word, sentiment, sort = TRUE)
# Comparison word clouds: reshape the counts into a word x sentiment matrix
# (missing combinations filled with 0) and draw at most 100 words.

# Ham
comparison.cloud(
  acast(sentiment_ham, word ~ sentiment, value.var = "n", fill = 0),
  colors = c("steelblue", "orange"),
  max.words = 100
)

# Spam
comparison.cloud(
  acast(sentiment_spam, word ~ sentiment, value.var = "n", fill = 0),
  colors = c("steelblue", "orange"),
  max.words = 100
)
# Load the NRC word-level emotion lexicon from a local text file.
# The first 45 lines are skipped (presumably the file's preamble; confirm
# against the lexicon file). Only rows flagged with value == 1 are kept,
# leaving a two-column (word, sentiment) table.
nrc <- read.delim(
  file = "NRC-Emotion-Lexicon-Wordlevel-v0.92.txt",
  header = FALSE,
  skip = 45,
  col.names = c("word", "sentiment", "value"),
  stringsAsFactors = FALSE
) %>%
  filter(value == 1) %>%
  select(word, sentiment)
# Attach NRC emotion labels to the tokens of each class.
# A token can occur in many rows of `data`, and a word can carry several
# NRC sentiments, so the join is many-to-many by design. Stating the key and
# the relationship explicitly documents that intent and avoids dplyr's
# "unexpected many-to-many relationship" warning.
ham_nrc <- data %>%
  filter(label == "ham") %>%
  inner_join(nrc, by = "word", relationship = "many-to-many")

spam_nrc <- data %>%
  filter(label == "spam") %>%
  inner_join(nrc, by = "word", relationship = "many-to-many")
# Number of matched tokens per NRC sentiment, largest first
ham_sentiment_count <- count(ham_nrc, sentiment, sort = TRUE)
spam_sentiment_count <- count(spam_nrc, sentiment, sort = TRUE)
# Draw a donut chart of NRC sentiment shares.
#
# counts: data frame with a `sentiment` column and a count column `n`
# title:  plot title
# Returns the ggplot object (auto-printed when called at top level).
plot_nrc_donut <- function(counts, title) {
  ggplot(counts, aes(x = 2, y = n, fill = sentiment)) +
    geom_col(width = 1) +
    coord_polar(theta = "y") +
    # Label each slice with its share of the total count, in percent
    geom_text(
      aes(label = paste0(round(n / sum(n) * 100, 1), "%")),
      position = position_stack(vjust = 0.5)
    ) +
    labs(title = title) +
    # The bar sits at x = 2 while the axis starts at 0.5, which leaves the
    # empty centre that turns the pie into a donut
    xlim(0.5, 2.5) +
    # theme_light() is a complete theme, so the theme_void() call that used
    # to precede it had no effect and has been removed
    theme_light()
}

# Plot ham
plot_nrc_donut(ham_sentiment_count, "NRC Sentiment - HAM")

# Plot spam
plot_nrc_donut(spam_sentiment_count, "NRC Sentiment - SPAM")
# Build bi-gram edge lists (word1 -> word2, weighted by count n) per class.
#
# Messages with fewer than two words produce an NA bigram. `%in%` never
# returns NA, so those rows would pass the stop-word filter, be counted as
# an (NA, NA) pair and end up in the graph as a bogus node named "NA".
# They are therefore removed before the bigram is split.
spam_bigram <- sms_data %>%
  filter(label == "spam") %>%
  unnest_tokens(bigram, message, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram)) %>%
  separate(bigram, into = c("word1", "word2"), sep = " ") %>%
  # Drop pairs in which either word is a stop word
  filter(!word1 %in% stop_words$word, !word2 %in% stop_words$word) %>%
  count(word1, word2, sort = TRUE) %>%
  # Keep only pairs seen at least 5 times
  filter(n > 4)
graph_spam <- graph_from_data_frame(spam_bigram)

ham_bigram <- sms_data %>%
  filter(label == "ham") %>%
  unnest_tokens(bigram, message, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram)) %>%
  separate(bigram, into = c("word1", "word2"), sep = " ") %>%
  # Drop pairs in which either word is a stop word
  filter(!word1 %in% stop_words$word, !word2 %in% stop_words$word) %>%
  count(word1, word2, sort = TRUE) %>%
  # Keep only pairs seen at least 5 times
  filter(n > 4)
graph_ham <- graph_from_data_frame(ham_bigram)
# Plot spam
# Network of frequent spam bi-grams using the "fr" layout.
# max.overlaps = Inf is forwarded to ggrepel so that every node keeps its
# label; with the default limit ggrepel silently dropped labels in crowded
# regions of this graph.
graph_spam %>%
  ggraph(layout = "fr") +
  geom_edge_link(alpha = 0.8) +
  geom_node_point(color = "#E91E63", size = 5) +
  geom_node_text(aes(label = name), repel = TRUE, max.overlaps = Inf) +
  labs(title = "Bi-gram Network: SPAM") +
  theme_void()
# Plot ham
# Network of frequent ham bi-grams: edges link consecutive words, nodes are
# labelled with the word itself (labels repelled to reduce overlap)
ggraph(graph_ham, layout = "fr") +
  geom_edge_link(alpha = 0.8) +
  geom_node_point(size = 5, color = "#3F51B5") +
  geom_node_text(mapping = aes(label = name), repel = TRUE) +
  theme_void() +
  labs(title = "Bi-gram Network: HAM")
The analysis highlights sharp differences in lexical and emotional content between spam and ham messages. While spam content exploits persuasive, emotionally charged words to drive action, ham tends to reflect more neutral, conversational tones. This distinction is especially visible in sentiment plots and word networks. Such analysis has potential applications in automated spam filters and social media moderation systems.
Mohammad, Saif M., and Turney, Peter D. (2013). NRC Emotion Lexicon. National Research Council Canada.
https://rpubs.com/nocoding/1197630
https://rpubs.com/BYUN_JUNHYUK_CAU/1197956