# Load the raw IMDB reviews (columns used below: `review`, `sentiment`).
movieDB <- read.csv("IMDB Dataset.csv")

# Tokenise to one row per (review, word):
#   * `id` is the review's row position and is the join key used later;
#   * `drop = FALSE` keeps the full review text next to every token
#     (spelled out, because `F` is an ordinary variable that can be reassigned);
#   * "br" tokens are dropped -- presumably residue of <br /> tags in the text;
#   * the stop-word join key is named explicitly instead of being guessed from
#     shared column names, so no "Joining with" message is emitted.
tidy_reviews <- movieDB %>%
  mutate(id = row_number()) %>%
  unnest_tokens(word, review, drop = FALSE) %>%
  filter(!word %in% c("br")) %>%
  anti_join(stop_words, by = "word")
## compare nrc score and bing score

# Lexicons. The local `nrc.rda` file is expected to provide an object named
# `nrc` (it is used right below); Bing comes from get_sentiments().
load("nrc.rda")
bing <- get_sentiments("bing")

# Only the NRC entries tagged with the "sadness" emotion are needed.
nrc_sad <- filter(nrc, sentiment == "sadness")

# NRC sadness score per review = sadness-tagged tokens / all retained tokens.

# Denominator: number of tokens per review.
total_words <- count(tidy_reviews, id, name = "total_words")

# Numerator: number of tokens per review that match an NRC "sadness" word.
sad_words <- count(
  inner_join(tidy_reviews, nrc_sad, by = "word"),
  id,
  name = "sad_words"
)

# Reviews without any sadness word are absent from `sad_words`; the left join
# leaves NA there, which becomes an explicit 0 before the division.
sad_score <- left_join(total_words, sad_words, by = "id") %>%
  mutate(sad_words = replace_na(sad_words, 0)) %>%
  mutate(sadness_score = sad_words / total_words)

# Bing polarity score per review:
#   (positive - negative) / (positive + negative + 1)
# where `positive` / `negative` are the counts of matched Bing words.
bing_score <- tidy_reviews %>%
  # Drop the IMDB label so it does not clash with the lexicon's `sentiment`.
  select(-sentiment) %>%
  # The same word occurs in many token rows, and dplyr reported that some
  # token rows match more than one lexicon row. That fan-out is expected here,
  # so it is declared explicitly instead of triggering a warning on every run.
  inner_join(bing, by = "word", relationship = "many-to-many") %>%
  count(id, sentiment) %>%
  # One row per review with a `positive` and a `negative` count column;
  # a polarity that never occurs in a review is filled with 0.
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(bing_score = (positive - negative) / (positive + negative + 1))
# Attach the original IMDB label to the per-review scores.
# `id` is rebuilt from the row position, the same way it was built for tokens.
sentiments <- select(mutate(movieDB, id = row_number()), id, sentiment)

# Left joins keep every review that has a sadness score; a review without any
# Bing match gets NA in the Bing columns.
final_scores <- left_join(
  left_join(sad_score, bing_score, by = "id"),
  sentiments,
  by = "id"
)

# Figure 1: distribution of Bing scores among the saddest reviews.

# Keep reviews whose sadness score lies above the 80th percentile.
top_sad <- final_scores %>%
  filter(sadness_score > quantile(sadness_score, 0.8))

# Overlaid, semi-transparent histograms, one per IMDB label.
# `na.rm = TRUE` drops reviews with a missing Bing score without a warning.
sad_bing_hist <- ggplot(top_sad, aes(x = bing_score, fill = sentiment)) +
  geom_histogram(bins = 40, position = "identity", alpha = 0.6, na.rm = TRUE) +
  labs(title = "Sentiment Score Distribution (Top 20% Sadness Score, Using Bing Lexicon)",
       x = "Bing Sentiment Score (Positive - Negative Ratio)",
       y = "Number of Reviews") +
  theme_minimal()
sad_bing_hist

# Pass the plot and its size explicitly: without them ggsave() writes whatever
# plot was displayed last, at the size of the current graphics device.
ggsave("images/nrc_vs_bing_sad_top20.png", plot = sad_bing_hist,
       width = 7, height = 5)
# NOTE(review): this repeats the load/tokenise step from the top of the file;
# when `movieDB` and `tidy_reviews` are already in the session it only
# recomputes the same objects.
movieDB <- read.csv("IMDB Dataset.csv")

# One row per (review, word). `drop = FALSE` keeps the review text (spelled out
# because `F` can be reassigned), "br" tokens are removed, and the stop-word
# join key is explicit so the join is not guessed from shared column names.
tidy_reviews <- movieDB %>%
  mutate(id = row_number()) %>%
  unnest_tokens(word, review, drop = FALSE) %>%
  filter(!word %in% c("br")) %>%
  anti_join(stop_words, by = "word")
# Reviews whose sadness score lies above the 80th percentile.
top_sad <- filter(final_scores, sadness_score > quantile(sadness_score, 0.8))

# Split the high-sadness reviews by IMDB label and tag each part.
sad_pos <- mutate(
  filter(top_sad, sentiment == "positive"),
  group = "sad_positive"
)
sad_neg <- mutate(
  filter(top_sad, sentiment == "negative"),
  group = "sad_negative"
)

# Stack both parts and keep only the review key and its group label.
sad_groups <- select(bind_rows(sad_pos, sad_neg), id, group)

# Restrict the token table to the labelled reviews; every token row gains the
# `group` of the review it belongs to.
tidy_sad <- inner_join(tidy_reviews, sad_groups, by = "id")
## log odds ratio

# How often each word occurs in each group.
word_counts <- count(tidy_sad, group, word)

# One row per word with a count column per group. A word missing from a group
# is filled with 0, then every count is raised by 1 so that no count is zero
# (Laplace smoothing).
word_wide <- word_counts %>%
  pivot_wider(names_from = group, values_from = n, values_fill = 0) %>%
  mutate(
    sad_positive = sad_positive + 1,
    sad_negative = sad_negative + 1
  )

# Group totals, computed on the smoothed counts.
total_pos <- sum(word_wide[["sad_positive"]])
total_neg <- sum(word_wide[["sad_negative"]])

# Log odds ratio per word: log of the word's share among sad-positive tokens
# over its share among sad-negative tokens (shares use the smoothed counts).
# Positive values lean towards sad_positive, negative values towards
# sad_negative; a value of exactly 0 matches neither rule, so `group` is NA.
lor <- word_wide %>%
  mutate(
    pos_prob = sad_positive / total_pos,
    neg_prob = sad_negative / total_neg
  ) %>%
  mutate(log_odds_ratio = log(pos_prob / neg_prob)) %>%
  mutate(
    group = case_when(
      log_odds_ratio < 0 ~ "sad_negative",
      log_odds_ratio > 0 ~ "sad_positive"
    )
  )

# Keep the 15 words with the largest absolute log odds ratio.
# slice_max() already returns rows ordered by `order_by`, so a preceding
# arrange() is unnecessary. `with_ties = FALSE` caps the result at exactly 15
# rows; by default every word tied with the 15th would be kept as well, and
# equal ratios can occur because many words share the same smoothed counts.
top_lor <- lor %>%
  slice_max(order_by = abs(log_odds_ratio), n = 15, with_ties = FALSE)

# Figure 2: horizontal bars of the selected words, ordered by log odds ratio
# and coloured by the group each word leans towards.
lor_plot <- ggplot(
  top_lor,
  aes(x = reorder(word, log_odds_ratio), y = log_odds_ratio, fill = group)
) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(title = "Top Words by Log Odds Ratio (Sad-Positive vs Sad-Negative)",
       x = "Word", y = "Log Odds Ratio") +
  theme_minimal()
lor_plot

# Pass the plot and its size explicitly: without them ggsave() writes whatever
# plot was displayed last, at the size of the current graphics device.
ggsave("images/logoddsratio.png", plot = lor_plot, width = 7, height = 5)
## n-gram

# Bigrams from the raw text of the labelled high-sadness reviews.
# `id` is rebuilt from the row position so that it matches `sad_groups`.
sad_bigrams <- movieDB %>%
  mutate(id = row_number()) %>%
  inner_join(sad_groups, by = "id") %>%
  unnest_tokens(bigram, review, token = "ngrams", n = 2)

# Split each bigram into its two words and keep a pair only if neither word is
# a stop word or "br", and both words contain at least one lowercase letter.
bigrams_separated <- sad_bigrams %>%
  separate(bigram, into = c("word1", "word2"), sep = " ") %>%
  filter(
    !(word1 %in% c(stop_words$word, "br") |
        word2 %in% c(stop_words$word, "br")),
    str_detect(word1, "[a-z]") & str_detect(word2, "[a-z]")
  )

# Bigram frequency per group, most frequent first.
bigram_graph_data <- bigrams_separated %>%
  count(group, word1, word2, sort = TRUE) %>%
  filter(n >= 20)  # minimum frequency; the threshold is adjustable

# Figure 3a: bigram network for the sad_positive group.
# Edge list: word1 -> word2, with the bigram count `n` as an edge attribute.
graph_df <- bigram_graph_data %>%
  filter(group == "sad_positive") %>%
  select(word1, word2, n)

graph <- graph_from_data_frame(graph_df)

# Fixed seed so the "fr" layout is the same on every run.
set.seed(123)
bigram_pos_net <- ggraph(graph, layout = "fr") +
  # More frequent bigrams are drawn with more opaque edges.
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), vjust = 1.8, size = 3) +
  labs(title = "Bigram Network for Sad-Positive Reviews") +
  theme_void()
bigram_pos_net

# Pass the plot and its size explicitly: without them ggsave() writes whatever
# plot was displayed last, at the size of the current graphics device.
ggsave("images/bigram.png", plot = bigram_pos_net, width = 7, height = 5)
# Figure 3b: bigram network for the sad_negative group.
# Edge list: word1 -> word2, with the bigram count `n` as an edge attribute.
graph_df <- bigram_graph_data %>%
  filter(group == "sad_negative") %>%
  select(word1, word2, n)

graph <- graph_from_data_frame(graph_df)

# Fixed seed so the "fr" layout is the same on every run.
# NOTE(review): unlike the other figures, this plot is not written to disk.
set.seed(123)
ggraph(graph, layout = "fr") +
  # More frequent bigrams are drawn with more opaque edges.
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), vjust = 1.8, size = 3) +
  # Title corrected: this graph shows the sad_negative group.
  labs(title = "Bigram Network for Sad-Negative Reviews") +
  theme_void()

Executive summary

What is (are) your main question(s)? What is your story? What does the final graphic show?

Main question:

Can sadness in movie reviews be considered inherently negative? Or can sadness coexist with positive sentiment, reflecting emotional complexity such as nostalgia, beauty, or inspiration?

Story:

Using IMDB reviews and NRC emotion lexicon, we first identified reviews with high sadness scores. We then divided them based on their sentiment labels (positive vs negative) and explored how sadness manifests differently depending on the overall sentiment orientation. Through a combination of tf-idf, log odds ratio, bigram networks, and word co-occurrence analysis, we investigated whether sadness always aligns with negativity.

Final takeaway:

Our findings show that sadness is not inherently negative. Reviews with high sadness scores but positive labels often express admiration, emotional depth, or appreciation for tragic beauty. In contrast, sad-negative reviews are more likely to focus on disappointment, poor production, or loss. The final visuals clearly highlight this contrast—both through statistically significant word associations and distinct bigram patterns—demonstrating that sadness can be part of a rich emotional landscape rather than a strictly negative signal.

Data background

Explain where the data came from, what agency or company made it, how it is structured, what it shows, etc.

The data used in this project comes from the IMDB Large Movie Review Dataset, originally developed by researchers at Stanford University for sentiment classification tasks. It contains 50,000 English-language movie reviews that have been pre-labeled as either “positive” or “negative” based on user ratings.

Each review is provided as plain text and associated with a binary sentiment label. The dataset is balanced, with an equal number of positive and negative reviews, and is commonly used in natural language processing (NLP) and machine learning tasks involving sentiment analysis.

For this project, we used the full review texts and their associated sentiment labels. We performed text preprocessing, tokenization, and emotion tagging using the NRC Emotion Lexicon. The dataset served as the foundation for all analyses, allowing us to explore how specific emotions—particularly sadness—are expressed across different sentiment categories.

Data loading, cleaning and preprocessing

Describe and show how you cleaned and reshaped the data

We began by importing the IMDB dataset as a .csv file containing 50,000 movie reviews, each labeled as either “positive” or “negative.” Each review was assigned a unique ID for tracking during tokenization and analysis.

To prepare the text data for analysis, we performed several preprocessing steps:

HTML tag removal – Some reviews included HTML line breaks (`<br />`); the `br` tokens they leave behind were filtered out after tokenization. Tokenization – Reviews were tokenized into individual words using the unnest_tokens() function from the tidytext package. Stopword removal – Common English stopwords (e.g., “the”, “is”, “and”) were removed using the standard stop_words list to retain only semantically meaningful terms. Emotion tagging – Each token was joined with the NRC Emotion Lexicon to identify words associated with specific emotions. In particular, we filtered for words tagged with the “sadness” label to calculate a sadness score for each review. Group assignment – Reviews in the top 20% of sadness scores were retained for further analysis and divided into two groups based on their sentiment label: sad_positive and sad_negative. This preprocessing pipeline allowed us to reduce noise in the data and focus on emotionally rich content, enabling precise comparisons between emotionally sad but positive reviews versus sad and negative ones.

Text data analysis

Individual analysis and figures

Analysis and Figure 1

Describe and show how you created the first figure. Why did you choose this figure type?

Comparing NRC and Bing Sentiment Scores

In our first figure, we compared two different lexicon-based sentiment scoring approaches: the NRC Emotion Lexicon and the Bing Liu Opinion Lexicon. Specifically, we examined reviews with high sadness scores (top 20% based on NRC) and plotted their Bing sentiment scores to explore how sadness correlates with overall polarity.

The NRC lexicon tags words with multiple emotions, such as “sadness,” “joy,” or “anger.” Using this, we calculated a sadness score for each review by computing the proportion of sadness-related words among all words in the review. On the other hand, the Bing lexicon assigns each word a binary polarity—either “positive” or “negative.” We calculated a Bing sentiment score for each review as the normalized difference between the counts of positive and negative words.

We then visualized the distribution of Bing scores for all high-sadness reviews using a histogram, color-coded by the original IMDB sentiment label (positive or negative). This figure was chosen because a histogram clearly shows the spread and overlap of sentiment scores across two groups, making it easy to observe patterns and potential contradictions.

What the figure shows: Despite all reviews having high sadness scores, their Bing sentiment scores vary significantly. A noticeable number of reviews labeled as “positive” exhibit negative or neutral Bing scores, and vice versa. This suggests that sadness is not inherently aligned with negative polarity, and can coexist with either positive or negative sentiment, depending on emotional context.

This figure supports our central question: Can sadness be positive? According to both lexicons, the answer appears to be yes—sadness is emotionally nuanced, not binary.

Analysis and Figure 2

Word Usage Differences via Log Odds Ratio

For our second figure, we performed a lexicon-independent statistical comparison of word usage in sad reviews. Specifically, we computed the log odds ratio to identify which words were disproportionately associated with each sentiment label (positive or negative) within the subset of high-sadness reviews.

We divided the top 20% most “sad” reviews—based on NRC sadness scores—into two groups:

sad_positive: Reviews labeled as “positive” by IMDB; sad_negative: Reviews labeled as “negative” by IMDB. We then calculated the log odds ratio of each word’s frequency across the two groups. This method highlights distinctive vocabulary by accounting for both absolute frequency and relative imbalance, offering more insight than raw word counts.

We visualized the top words with the highest absolute log odds ratios, color-coded by group. Bar lengths indicate the strength of association, while direction shows polarity within sadness.

Why this figure matters: Unlike lexicon-based sentiment scores, this approach reveals organically emergent vocabulary patterns. For instance:

sad_positive reviews are characterized by words like “beautiful,” “heartwarming,” “touching,” suggesting themes of poignant beauty or emotional catharsis. sad_negative reviews lean toward “pain,” “loss,” “waste,” “depressed,” indicating a tone of despair or frustration. This figure shows that although all reviews are emotionally “sad,” the narrative framing of that sadness—uplifting vs hopeless—can diverge sharply, depending on context.

It supports our core argument that sadness is not uniformly negative and must be interpreted in tandem with how it is expressed linguistically.

Analysis and Figure 3

Exploring Emotional Language via Bigram Networks

To further investigate how sadness is expressed in different emotional contexts, we constructed bigram networks for the sad_positive and sad_negative groups. These visualizations show the most common two-word combinations (bigrams) in each group, after removing stopwords and irrelevant tokens.

Each network node represents a word, and edges connect words that frequently appear together as bigrams. Edge opacity corresponds to the frequency of the bigram: more frequent bigrams are drawn with darker edges. This method allows us to explore narrative structure and emotional framing beyond individual words.

Why this figure is important: While earlier analyses identified which words were most common or distinctive, the bigram network reveals how these words are combined to construct emotional meaning.

In the sad_positive network, we observe clusters like “beautiful story,” “heart warming,” and “made cry.” These suggest a tone of emotional resonance, where sadness is linked with beauty, nostalgia, or meaningful impact. In the sad_negative network, common phrases include “waste time,” “poor acting,” and “fell asleep,” indicating frustration, boredom, or disappointment. Although both sets of reviews were high in sadness, the linguistic patterns diverge sharply in emotional tone and narrative function. The positive group frames sadness as touching or cathartic, while the negative group frames it as exhausting or disappointing.

Analytical value: This network-based n-gram analysis supports the idea that sentiment polarity within sadness is reflected in how sadness is verbalized. It provides rich contextual cues that simpler bag-of-words models would miss.

In showing the figures that you created, describe why you designed it the way you did. Why did you choose those colors, fonts, and other design elements? Does it convey truth?

In designing all figures in this report, careful attention was paid to visual clarity and emotional resonance. We chose each element—color, font, layout—not only for aesthetic appeal, but to enhance interpretability and support the core story.

Color palette:

Blue tones were used consistently across visualizations to represent sadness, aligning with its common emotional and cultural association. In comparison charts (e.g., log odds ratio and bigram plots), we assigned deeper or warmer hues (e.g., teal for sad_positive, navy for sad_negative) to distinguish sentiment while maintaining thematic coherence.

Font and layout:

We used clean, sans-serif fonts to ensure readability and modern aesthetic. Titles and axis labels were written in sentence case rather than all caps to maintain a conversational, narrative tone. Gridlines were kept minimal to avoid clutter while still helping guide the eye.

Figure types:

Bar plots were chosen for comparisons involving word frequencies and ratios, as they clearly show relative differences. Network graphs were used to highlight relationships between words, emphasizing structure and flow in emotional language.

Does it convey truth?

We believe so—each design choice supports transparency and interpretability. We avoided excessive decoration or misleading encodings (e.g., 3D charts, ambiguous scales). Our goal was to make the emotional patterns in language feel intuitive without oversimplifying them.

By blending visual logic with emotional tone, the figures not only present data, but also evoke the nuanced relationship between sadness and sentiment.