# Load the IMDB reviews and reshape them to one row per (review, word).
# `id` is each review's row position in the CSV; later steps join on it.
movieDB <- read.csv("IMDB Dataset.csv")
tidy_reviews <- movieDB %>%
  mutate(id = row_number()) %>%
  # drop = FALSE keeps the full `review` text next to every token.
  # Spelled out as FALSE because `F` is an ordinary, reassignable variable.
  unnest_tokens(word, review, drop = FALSE) %>%
  # Discard the "br" token (presumably left over from HTML <br /> tags).
  filter(!word %in% c("br")) %>%
  # Explicit join key instead of relying on the inferred `join_by(word)`.
  anti_join(stop_words, by = "word")
## Compare NRC score and Bing score
# Lexicons for the two scores. "nrc.rda" is expected to restore an object
# named `nrc`, which is filtered on the next line.
load("nrc.rda")
nrc_sad <- filter(nrc, sentiment == "sadness")
bing <- get_sentiments("bing")
# NRC sadness score per review: the share of a review's tokens in
# `tidy_reviews` that match the NRC "sadness" word list.
total_words <- count(tidy_reviews, id, name = "total_words")
sad_words <- count(
  inner_join(tidy_reviews, nrc_sad, by = "word"),
  id,
  name = "sad_words"
)
sad_score <- left_join(total_words, sad_words, by = "id") %>%
  mutate(
    # Reviews without any sadness-tagged word have no row in `sad_words`,
    # so the left join leaves NA there; treat that as a count of zero.
    sad_words = replace_na(sad_words, 0),
    sadness_score = sad_words / total_words
  )
# Bing polarity score per review:
#   (positive - negative) / (positive + negative + 1)
# where positive/negative are the counts of Bing-tagged tokens in the review.
# The +1 keeps the denominator non-zero.
bing_score <- tidy_reviews %>%
  # Drop the review-level label so it does not collide with the lexicon's
  # own `sentiment` column in the join below.
  select(-sentiment) %>%
  # Many tokens share the same word, and at least one word matches more than
  # one lexicon row, so this join is many-to-many by design. Declaring the
  # relationship leaves the result unchanged and silences dplyr's
  # "unexpected many-to-many relationship" warning.
  inner_join(bing, by = "word", relationship = "many-to-many") %>%
  count(id, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(bing_score = (positive - negative) / (positive + negative + 1))
# Attach each review's original sentiment label to its two lexicon scores.
# `id` is rebuilt from the row position in `movieDB`, the same way it was
# assigned before tokenization.
sentiments <- mutate(movieDB, id = row_number())
sentiments <- select(sentiments, id, sentiment)
final_scores <- left_join(sad_score, bing_score, by = "id")
final_scores <- left_join(final_scores, sentiments, by = "id")
# Visualize: Bing score distribution of the reviews whose sadness score is
# above the 80th percentile, filled by the original sentiment label.
top_sad <- filter(final_scores, sadness_score > quantile(sadness_score, 0.8))
ggplot(top_sad, aes(x = bing_score, fill = sentiment)) +
  # position = "identity" overlays the two label groups instead of stacking
  # them; na.rm drops reviews that have no Bing score.
  geom_histogram(position = "identity", bins = 40, alpha = 0.6, na.rm = TRUE) +
  theme_minimal() +
  ggtitle("Sentiment Score Distribution (Top 20% Sadness Score, Using Bing Lexicon)") +
  xlab("Bing Sentiment Score (Positive - Negative Ratio)") +
  ylab("Number of Reviews")

# No plot is passed, so ggsave() writes the most recent ggplot.
ggsave("images/nrc_vs_bing_sad_top20.png")
## Saving 7 x 5 in image
# Rebuild `movieDB` and `tidy_reviews` (one row per review/word pair) with the
# same steps used at the start of the script.
movieDB <- read.csv("IMDB Dataset.csv")
tidy_reviews <- movieDB %>%
  mutate(id = row_number()) %>%
  # drop = FALSE keeps the full `review` text next to every token.
  # Spelled out as FALSE because `F` is an ordinary, reassignable variable.
  unnest_tokens(word, review, drop = FALSE) %>%
  # Discard the "br" token (presumably left over from HTML <br /> tags).
  filter(!word %in% c("br")) %>%
  # Explicit join key instead of relying on the inferred `join_by(word)`.
  anti_join(stop_words, by = "word")
# Reviews whose sadness score is above the 80th percentile.
top_sad <- filter(final_scores, sadness_score > quantile(sadness_score, 0.8))

# Positive group: high sadness score, labeled positive.
sad_pos <- filter(top_sad, sentiment == "positive")
sad_pos <- mutate(sad_pos, group = "sad_positive")

# Negative group: high sadness score, labeled negative.
sad_neg <- filter(top_sad, sentiment == "negative")
sad_neg <- mutate(sad_neg, group = "sad_negative")

# Review id -> group lookup, positive rows first.
sad_groups <- select(bind_rows(sad_pos, sad_neg), id, group)

# Word-level data restricted to the grouped reviews, with the group attached.
tidy_sad <- inner_join(tidy_reviews, sad_groups, by = "id")
## Log odds ratio
# Word frequency per group, then one row per word with a column per group.
word_counts <- count(tidy_sad, group, word)
word_wide <- word_counts %>%
  pivot_wider(names_from = group, values_from = n, values_fill = 0) %>%
  # Add-one (Laplace) smoothing so no word has a zero count in either group.
  mutate(
    sad_positive = sad_positive + 1,
    sad_negative = sad_negative + 1
  )

# Group totals are taken from the smoothed counts.
total_pos <- sum(word_wide$sad_positive)
total_neg <- sum(word_wide$sad_negative)

# Relative frequency of each word in each group, and the log of their ratio.
lor <- mutate(
  word_wide,
  pos_prob = sad_positive / total_pos,
  neg_prob = sad_negative / total_neg
)
lor <- mutate(lor, log_odds_ratio = log(pos_prob / neg_prob))
# Label each word with the group it leans toward; a ratio of exactly 0
# matches neither condition and is left as NA.
lor <- mutate(
  lor,
  group = case_when(
    log_odds_ratio > 0 ~ "sad_positive",
    log_odds_ratio < 0 ~ "sad_negative"
  )
)
# Top words by absolute log odds ratio.
# slice_max() already returns rows ordered by `order_by`, so no arrange() is
# needed first. Ties are kept by default, so more than 15 rows can come back
# when several words share the same absolute ratio.
top_lor <- lor %>%
  slice_max(order_by = abs(log_odds_ratio), n = 15)

# Plot: horizontal bars ordered by signed ratio, colored by the group each
# word leans toward.
ggplot(
  top_lor,
  aes(x = reorder(word, log_odds_ratio), y = log_odds_ratio, fill = group)
) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(
    title = "Top Words by Log Odds Ratio (Sad-Positive vs Sad-Negative)",
    x = "Word", y = "Log Odds Ratio"
  ) +
  theme_minimal()

# No plot is passed, so ggsave() writes the most recent ggplot.
ggsave("images/logoddsratio.png")
## Saving 7 x 5 in image
## n-gram
# Extract bigrams from the raw text of the reviews in the two sad groups.
sad_bigrams <- mutate(movieDB, id = row_number())
sad_bigrams <- inner_join(sad_bigrams, sad_groups, by = "id")
sad_bigrams <- unnest_tokens(
  sad_bigrams, bigram, review,
  token = "ngrams", n = 2
)

# Split each bigram into its two words and keep only pairs where both words
# contain a letter, are not stop words, and are not the "br" token.
bigrams_separated <- sad_bigrams %>%
  separate(bigram, into = c("word1", "word2"), sep = " ") %>%
  filter(
    word1 != "br", word2 != "br",
    str_detect(word1, "[a-z]"),
    str_detect(word2, "[a-z]"),
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word
  )

# Count bigrams per group, most frequent first, and keep the frequent ones.
bigram_graph_data <- filter(
  count(bigrams_separated, group, word1, word2, sort = TRUE),
  n >= 20 # threshold is adjustable
)
# Bigram network for the sad-positive group: words are nodes, bigrams are
# edges, and edge transparency follows the bigram count.
graph_df <- filter(bigram_graph_data, group == "sad_positive")
graph_df <- select(graph_df, word1, word2, n)
graph <- graph_from_data_frame(graph_df)
# Seed is set right before the "fr" layout is requested, presumably so the
# drawing is reproducible between runs.
set.seed(123)
ggraph(graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE) +
  geom_node_point(size = 5, color = "lightblue") +
  geom_node_text(aes(label = name), size = 3, vjust = 1.8) +
  theme_void() +
  ggtitle("Bigram Network for Sad-Positive Reviews")

# No plot is passed, so ggsave() writes the most recent plot.
ggsave("images/bigram.png")
## Saving 7 x 5 in image
# Bigram network for the sad-negative group: words are nodes, bigrams are
# edges, and edge transparency follows the bigram count.
graph_df <- bigram_graph_data %>%
  filter(group == "sad_negative") %>%
  select(word1, word2, n)
graph <- graph_from_data_frame(graph_df)
# Seed is set right before the "fr" layout is requested, presumably so the
# drawing is reproducible between runs.
set.seed(123)
ggraph(graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), vjust = 1.8, size = 3) +
  # Title names the group actually plotted here (it previously said
  # "Sad-Positive", copied from the positive-group plot).
  labs(title = "Bigram Network for Sad-Negative Reviews") +
  theme_void()

Executive summary
What is (are) your main question(s)? What is your story? What does
the final graphic show?
Main question:
Can sadness in movie reviews be considered inherently negative? Or
can sadness coexist with positive sentiment, reflecting emotional
complexity such as nostalgia, beauty, or inspiration?
Story:
Using IMDB reviews and NRC emotion lexicon, we first identified
reviews with high sadness scores. We then divided them based on their
sentiment labels (positive vs negative) and explored how sadness
manifests differently depending on the overall sentiment orientation.
Through a combination of tf-idf, log odds ratio, bigram networks, and
word co-occurrence analysis, we investigated whether sadness always
aligns with negativity.
Final takeaway:
Our findings show that sadness is not inherently negative. Reviews
with high sadness scores but positive labels often express admiration,
emotional depth, or appreciation for tragic beauty. In contrast,
sad-negative reviews are more likely to focus on disappointment, poor
production, or loss. The final visuals clearly highlight this
contrast—both through statistically significant word associations and
distinct bigram patterns—demonstrating that sadness can be part of a
rich emotional landscape rather than a strictly negative signal.
Data background
Explain where the data came from, what agency or company made it,
how it is structured, what it shows, etc.
The data used in this project comes from the IMDB Large Movie Review
Dataset, originally developed by researchers at Stanford University for
sentiment classification tasks. It contains 50,000 English-language
movie reviews that have been pre-labeled as either “positive” or
“negative” based on user ratings.
Each review is provided as plain text and associated with a binary
sentiment label. The dataset is balanced, with an equal number of
positive and negative reviews, and is commonly used in natural language
processing (NLP) and machine learning tasks involving sentiment
analysis.
For this project, we used the full review texts and their associated
sentiment labels. We performed text preprocessing, tokenization, and
emotion tagging using the NRC Emotion Lexicon. The dataset served as the
foundation for all analyses, allowing us to explore how specific
emotions—particularly sadness—are expressed across different sentiment
categories.
Data loading, cleaning and preprocessing
Describe and show how you cleaned and reshaped the data
We began by importing the IMDB dataset as a .csv file containing
50,000 movie reviews, each labeled as either “positive” or “negative.”
Each review was assigned a unique ID for tracking during tokenization
and analysis.
To prepare the text data for analysis, we performed several
preprocessing steps:
1. HTML tag removal – Some reviews included HTML line breaks (<br />).
   These were removed by dropping the leftover "br" tokens after
   tokenization.
2. Tokenization – Reviews were tokenized into individual words using the
   unnest_tokens() function from the tidytext package.
3. Stopword removal – Common English stopwords (e.g., “the”, “is”, “and”)
   were removed using the standard stop_words list to retain only
   semantically meaningful terms.
4. Emotion tagging – Each token was joined with the NRC Emotion Lexicon to
   identify words associated with specific emotions. In particular, we
   filtered for words tagged with the “sadness” label to calculate a
   sadness score for each review.
5. Group assignment – Reviews in the top 20% of sadness scores were
   retained for further analysis and divided into two groups based on
   their sentiment label: sad_positive and sad_negative.

This preprocessing pipeline allowed us to reduce noise in the data and
focus on emotionally rich content, enabling precise comparisons between
emotionally sad but positive reviews and sad and negative ones.
Text data analysis