Executive Summary

Spam messages are a modern nuisance. This report compares spam and ham messages to uncover meaningful differences using TF-IDF, sentiment analysis (Bing & NRC), and bi-gram analysis.

Data Loading and Preprocessing

# Read the raw SMS dataset from disk; the column-type summary is suppressed
full_data <- read_csv(file = "spam.csv", show_col_types = FALSE)

## New names:
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`

# Keep only the two columns used downstream, renaming them while selecting:
# v1 becomes `label`, v2 becomes `message`
sms_data <- select(full_data, label = v1, message = v2)

# Split every message into one-word-per-row tokens, then drop every row whose
# word appears in `stop_words`
data <- sms_data %>%
  unnest_tokens(output = word, input = message) %>%
  anti_join(stop_words, by = "word")

TF-IDF Analysis

# Number of occurrences of each word within each label
word_counts <- data %>%
  count(label, word, sort = TRUE)

# tf, idf and tf-idf per word, treating each label as a single "document"
tf_counts <- word_counts %>%
  bind_tf_idf(word, label, n)

# Ten highest tf-idf words per label (ties broken arbitrarily so each label
# contributes exactly ten rows). The grouping is dropped afterwards so that
# `top10` is a plain tibble, and `label` is turned into a factor with a fixed
# level order so the facets and fill colours are stable.
top10 <- tf_counts %>%
  group_by(label) %>%
  slice_max(tf_idf, n = 10, with_ties = FALSE) %>%
  ungroup() %>%
  mutate(label = factor(label, levels = c("ham", "spam")))

# Horizontal bar chart, one facet per label. reorder_within() together with
# scale_x_reordered() sorts the bars independently inside each facet.
ggplot(
  top10,
  aes(x = reorder_within(word, tf_idf, label), y = tf_idf, fill = label)
) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~ label, scales = "free") +
  labs(
    title = "Top 10 Words by TF-IDF",
    y = "TF-IDF Score",
    x = NULL
  ) +
  scale_x_reordered() +
  scale_fill_manual(values = c("ham" = "#4DB6AC", "spam" = "#FF8A65")) +
  theme_minimal()

Bing Sentiment Wordcloud

# Bing sentiment lexicon as returned by get_sentiments()
bing <- get_sentiments("bing")

# Keep only the tokens that appear in the lexicon. The join key is stated
# explicitly so the join does not depend on whichever column names happen to
# be shared between the two tables.
sentiment_data <- data %>%
  inner_join(bing, by = "word")

## Joining with `by = join_by(word)`

# Per-label frequency of every (word, sentiment) pair, most frequent first
sentiment_ham <- sentiment_data %>%
  filter(label == "ham") %>%
  count(word, sentiment, sort = TRUE)

sentiment_spam <- sentiment_data %>%
  filter(label == "spam") %>%
  count(word, sentiment, sort = TRUE)

# Comparison clouds: each count table is cast to a word-by-sentiment matrix
# (absent combinations filled with 0) and passed to comparison.cloud()

# HAM
comparison.cloud(
  acast(sentiment_ham, word ~ sentiment, value.var = "n", fill = 0),
  colors = c("steelblue", "orange"),
  max.words = 100
)

# SPAM
comparison.cloud(
  acast(sentiment_spam, word ~ sentiment, value.var = "n", fill = 0),
  colors = c("steelblue", "orange"),
  max.words = 100
)

NRC Sentiment Donut Plot

# Read the word-level NRC lexicon file: no header row, the first 45 lines are
# skipped, and the three columns are named word / sentiment / value
nrc <- read.delim(
  file = "NRC-Emotion-Lexicon-Wordlevel-v0.92.txt",
  header = FALSE,
  skip = 45,
  col.names = c("word", "sentiment", "value"),
  stringsAsFactors = FALSE
)

# Keep only the rows flagged with value 1, then drop the flag column
nrc <- nrc %>%
  filter(value == 1) %>%
  select(word, sentiment)

# Attach NRC sentiments to the ham tokens. A token can occur on many rows of
# `data` and a word can have several rows in `nrc`, so a many-to-many match is
# expected here; declaring the key and the relationship makes that explicit
# instead of relying on an implicit join that warns.
ham_nrc <- data %>%
  filter(label == "ham") %>%
  inner_join(nrc, by = "word", relationship = "many-to-many")

## Joining with `by = join_by(word)`

## Warning in inner_join(., nrc): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 2 of `x` matches multiple rows in `y`.
## ℹ Row 9802 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# Attach NRC sentiments to the spam tokens. A token can occur on many rows of
# `data` and a word can have several rows in `nrc`, so a many-to-many match is
# expected here; declaring the key and the relationship makes that explicit
# instead of relying on an implicit join that warns.
spam_nrc <- data %>%
  filter(label == "spam") %>%
  inner_join(nrc, by = "word", relationship = "many-to-many")

## Joining with `by = join_by(word)`

## Warning in inner_join(., nrc): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 26 of `x` matches multiple rows in `y`.
## ℹ Row 8249 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# Number of token/sentiment matches per NRC sentiment, largest first
ham_sentiment_count <- ham_nrc %>% count(sentiment, sort = TRUE)
spam_sentiment_count <- spam_nrc %>% count(sentiment, sort = TRUE)

# Draw a donut chart of sentiment shares.
#
# sentiment_counts: data frame with columns `sentiment` and `n`
# plot_title:       title shown above the chart
#
# Returns the ggplot object; it is printed when the call is evaluated at top
# level.
plot_nrc_donut <- function(sentiment_counts, plot_title) {
  ggplot(sentiment_counts, aes(x = 2, y = n, fill = sentiment)) +
    geom_bar(stat = "identity", width = 1) +
    coord_polar(theta = "y") +
    # Percentage of the total, centred inside each slice
    geom_text(
      aes(label = paste0(round(n / sum(n) * 100, 1), "%")),
      position = position_stack(vjust = 0.5)
    ) +
    labs(title = plot_title) +
    # The x range extends below the bar (which sits at x = 2) to leave the
    # hole in the middle of the donut
    xlim(0.5, 2.5) +
    # theme_light() is a complete theme and replaces any theme added before
    # it, so only this (the effective) theme is kept
    theme_light()
}

# Plot ham
plot_nrc_donut(ham_sentiment_count, "NRC Sentiment - HAM")

# Plot spam
plot_nrc_donut(spam_sentiment_count, "NRC Sentiment - SPAM")

Bi-gram Network

# Count stop-word-free bigrams for the messages of one label.
#
# messages:     data frame with columns `label` and `message`
# target_label: label value whose messages are analysed ("spam" or "ham")
# min_n:        only bigrams occurring more than `min_n` times are kept
#
# Returns a data frame with columns word1, word2 and n, most frequent first.
count_label_bigrams <- function(messages, target_label, min_n = 4) {
  messages %>%
    filter(label == target_label) %>%
    unnest_tokens(bigram, message, token = "ngrams", n = 2) %>%
    # Messages with fewer than two words produce an NA bigram. `NA %in% x` is
    # FALSE, so such rows would survive the stop-word filter below and end up
    # as a bogus "NA" node in the network; remove them first.
    filter(!is.na(bigram)) %>%
    separate(bigram, into = c("word1", "word2"), sep = " ") %>%
    filter(!word1 %in% stop_words$word, !word2 %in% stop_words$word) %>%
    count(word1, word2, sort = TRUE) %>%
    filter(n > min_n)
}

# Bigram edge lists and the corresponding graphs (word1 -> word2)
spam_bigram <- count_label_bigrams(sms_data, "spam")
graph_spam <- graph_from_data_frame(spam_bigram)

ham_bigram <- count_label_bigrams(sms_data, "ham")
graph_ham <- graph_from_data_frame(ham_bigram)

## Warning: In `d`, `NA` elements were replaced with string "NA".

# Plot spam: force-directed ("fr") layout of the spam bigram graph
graph_spam %>%
  ggraph(layout = "fr") +
  geom_edge_link(alpha = 0.8) +
  geom_node_point(color = "#E91E63", size = 5) +
  # With the default overlap limit ggrepel silently drops labels in crowded
  # regions; max.overlaps = Inf keeps a label on every node
  geom_node_text(aes(label = name), repel = TRUE, max.overlaps = Inf) +
  labs(title = "Bi-gram Network: SPAM") +
  theme_void()

## Warning: ggrepel: 32 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Plot ham: force-directed ("fr") layout of the ham bigram graph, with
# repelled node labels on an empty background
ggraph(graph_ham, layout = "fr") +
  geom_edge_link(alpha = 0.8) +
  geom_node_point(size = 5, color = "#3F51B5") +
  geom_node_text(mapping = aes(label = name), repel = TRUE) +
  labs(title = "Bi-gram Network: HAM") +
  theme_void()

Conclusion

The analysis highlights sharp differences in lexical and emotional content between spam and ham messages. While spam content exploits persuasive, emotionally charged words to drive action, ham tends to reflect more neutral, conversational tones. This distinction is especially visible in sentiment plots and word networks. Such analysis has potential applications in automated spam filters and social media moderation systems.

References

Mohammad, Saif M., and Turney, Peter D. (2013). NRC Emotion Lexicon. National Research Council Canada.

https://rpubs.com/nocoding/1197630
https://rpubs.com/BYUN_JUNHYUK_CAU/1197956

Detecting Spam in SMS Messages using Text Analysis

JUNHYUK LEE

2025.06.13