library(readr)
library(dplyr)
library(ggplot2)
library(tidytext)
library(wordcloud)
library(RColorBrewer)
library(knitr)
library(stringr)
library(syuzhet)

# Load the IMDB reviews. The code below relies on two columns from the CSV:
# `review` (the review text) and `sentiment` (the review's label).
imdb_data <- read_csv("IMDB Dataset.csv")

# Give every review a stable id so tokens can be traced back to their review.
imdb_data <- imdb_data %>%
  mutate(review_id = row_number())

# Tokenize into one row per word. unnest_tokens() only consumes the `review`
# column; `sentiment` and `review_id` are carried along onto every token row,
# so each word already has the label of the review it came from.
# (Do NOT re-derive it with imdb_data$sentiment[row_number()]: here
# row_number() counts tokens, not reviews, so that lookup attaches the wrong
# label to most tokens and NA to every token beyond nrow(imdb_data).)
tidy_reviews <- imdb_data %>%
  unnest_tokens(word, review)

# Drop common English stop words using tidytext's `stop_words` lexicon.
data("stop_words")
tidy_reviews <- tidy_reviews %>%
  anti_join(stop_words, by = "word")

# Overall word frequencies, most frequent first.
# NOTE(review): the top token "br" presumably comes from HTML `<br />` markup
# in the raw text -- confirm before interpreting it as a real word.
word_count <- tidy_reviews %>%
  count(word, sort = TRUE)
head(word_count, 10)
## # A tibble: 10 × 2
##    word            n
##    <chr>       <int>
##  1 br         201951
##  2 movie       86953
##  3 film        77608
##  4 time        25030
##  5 story       22990
##  6 bad         18448
##  7 people      17850
##  8 movies      15262
##  9 characters  14429
## 10 watch       13919
# Sentiment Distribution Plot
# Bar chart of the number of reviews per sentiment label.
sentiment_distribution <- imdb_data %>%
  count(sentiment) %>%
  ggplot(aes(x = sentiment, y = n, fill = sentiment)) +
  # geom_col() draws bars whose heights are the pre-computed counts in `n`.
  geom_col() +
  labs(
    title = "Distribution of Sentiments in IMDB Reviews",
    x = "Sentiment",
    y = "Count"
  ) +
  theme_minimal() +
  # Fixed y-range; a bar taller than 30000 would be dropped by ylim().
  ylim(0, 30000)

print(sentiment_distribution)

# Make sure the output directory exists; otherwise ggsave() cannot write the
# file on a fresh checkout.
dir.create("images", showWarnings = FALSE, recursive = TRUE)
ggsave(
  "images/sentiment_distribution.png",
  plot = sentiment_distribution,
  width = 10,
  height = 6
)

# Approximate each review's word count as (number of whitespace runs) + 1.
imdb_data <- mutate(
  imdb_data,
  review_length = str_count(review, "\\s+") + 1
)

# Histogram of review lengths, one facet per sentiment. Each facet gets its
# own y-axis so the two shapes can be compared independently of their counts.
length_distribution <- ggplot(
  imdb_data,
  aes(x = review_length, fill = sentiment)
) +
  geom_histogram(bins = 50, alpha = 0.6, position = "identity") +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(
    title = "Distribution of Review Lengths by Sentiment",
    x = "Review Length (words)",
    y = "Frequency"
  ) +
  theme_minimal()

print(length_distribution)

# Boxplot of review lengths per sentiment; outliers are drawn as red open
# circles.
boxplot_distribution <- ggplot(
  imdb_data,
  aes(x = sentiment, y = review_length, fill = sentiment)
) +
  geom_boxplot(outlier.colour = "red", outlier.shape = 1) +
  scale_fill_brewer(palette = "Set3") +
  labs(
    title = "Boxplot of Review Lengths by Sentiment",
    x = "Sentiment",
    y = "Review Length (words)"
  ) +
  theme_minimal()

print(boxplot_distribution)

# Split the tokens by the `sentiment` column of `tidy_reviews`.
positive_reviews <- tidy_reviews %>%
  filter(sentiment == "positive")

negative_reviews <- tidy_reviews %>%
  filter(sentiment == "negative")

# Per-sentiment word frequencies, keeping only words seen more than 5 times.
positive_word_count <- positive_reviews %>%
  count(word, sort = TRUE) %>%
  filter(n > 5)

negative_word_count <- negative_reviews %>%
  count(word, sort = TRUE) %>%
  filter(n > 5)

# Two clouds side by side on a black background with no margins.
# par() returns the previous values of the settings it changes; keep them so
# that *all* of them (including `bg`) can be restored afterwards.
old_par <- par(mfrow = c(1, 2), mar = c(0, 0, 0, 0), bg = "black")

# Wordcloud for positive reviews
wordcloud(
  words = positive_word_count$word, freq = positive_word_count$n,
  min.freq = 5, max.words = 150, random.order = FALSE,
  rot.per = 0.25, scale = c(3, 0.5),
  colors = brewer.pal(8, "Set1")
)

# Wordcloud for negative reviews
wordcloud(
  words = negative_word_count$word, freq = negative_word_count$n,
  min.freq = 5, max.words = 150, random.order = FALSE,
  rot.per = 0.25, scale = c(3, 0.5),
  colors = brewer.pal(8, "Set2")
)

# Restore the layout, margins and background that were active before the
# clouds, so later base-graphics plots are not drawn on black.
par(old_par)

# Score every review's text with syuzhet's get_sentiment(), AFINN method.
imdb_data <- mutate(
  imdb_data,
  sentiment_score = get_sentiment(review, method = "afinn")
)

# Sentiment score against review length, one semi-transparent point per
# review, coloured by the review's sentiment label.
scatter_plot <- imdb_data %>%
  ggplot(aes(x = review_length, y = sentiment_score, color = sentiment)) +
  geom_point(alpha = 0.5) +
  theme_minimal() +
  labs(
    title = "Scatterplot of Sentiment Score vs. Review Length",
    x = "Review Length (words)",
    y = "Sentiment Score"
  )

print(scatter_plot)

# Review-length summary per sentiment label: centre (mean, median) and
# spread (standard deviation, interquartile range).
stats_summary <- summarise(
  group_by(imdb_data, sentiment),
  mean_length = mean(review_length),
  median_length = median(review_length),
  sd_length = sd(review_length),
  iqr_length = IQR(review_length)
)

print(stats_summary)
## # A tibble: 2 × 5
##   sentiment mean_length median_length sd_length iqr_length
##   <chr>           <dbl>         <dbl>     <dbl>      <dbl>
## 1 negative         229.           174      165.        150
## 2 positive         233.           172      177.        159

Executive summary

Exploring the Relationship Between Review Length, Sentiment Distribution, and Sentiment Intensity in IMDB Movie Reviews. Q1: Is there a correlation between the length of reviews and their sentiment? Q2: What insights can be drawn about the intensity of sentiments expressed in reviews based on the calculated sentiment scores?

The story revolves around analyzing viewer sentiment in IMDB reviews to uncover underlying patterns in how people express their feelings about movies. It begins with a broad overview of sentiment distribution, providing a snapshot of general audience reactions. The analysis then delves deeper into the text of the reviews, highlighting specific words that frequently appear in either positive or negative contexts, thereby illustrating the common language used to express satisfaction or dissatisfaction. Further, the narrative explores whether the length of reviews correlates with sentiment, suggesting that more detailed reviews might carry stronger emotional weight. This aspect of the story is enriched by examining the relationship between review lengths and computed sentiment scores, offering insights into how the depth and intensity of sentiments are conveyed through longer or more detailed textual content.

What does the final graphic show? This final graphic ties together the primary research questions, highlighting the relationship between review length and sentiment, and providing a visual representation of sentiment intensity across different review lengths.

Data background

The IMDB dataset used in this analysis was obtained from Kaggle. The specific dataset can be found at IMDB Dataset of 50K Movie Reviews, and it was created by Lakshmi Nair. The dataset provides a comprehensive collection of 50,000 movie reviews, each labeled with a sentiment (positive or negative), making it ideal for sentiment analysis. It offers a balanced distribution of sentiments, allowing for robust training and evaluation of text classification models.

The IMDB dataset used in this analysis contains movie reviews and their associated sentiments. The dataset includes two main variables: - review: The text of the movie review. - sentiment: The sentiment label for the review, which can be either “positive” or “negative”.

Data loading, cleaning and preprocessing

Data loading and tokenization come first. Stop Words Handling: Initially, I considered removing frequently appearing but less informative words like “br”, “film”, and “movie”. However, recognizing that these terms reflect common language patterns in movie reviews, I decided to retain them. This approach preserves the natural language used by reviewers, maintaining the integrity of sentiment analysis. After cleaning and reshaping the data, I organized the visualizations in a logical sequence to effectively tell the story. This structured approach ensures a clear narrative flow, starting from a broad sentiment overview, moving to an analysis of review characteristics, and concluding with an exploration of the Relationship Between Review Length, Sentiment Distribution, and Sentiment Intensity.

Text data analysis

Conclusions 1. Correlation between Review Length and Sentiment: - The scatter plot and distribution analyses suggest that there is no strong linear correlation between the length of reviews and their sentiment. Both short and long reviews display a wide range of sentiment scores. This indicates that the length of a review does not necessarily predict whether the sentiment will be positive or negative.

2. Insights on Sentiment Intensity: - The boxplot and sentiment distribution graphs reveal that both negative and positive sentiments can exhibit extreme values, but positive reviews might show a slightly wider range of sentiment scores, indicated by the presence of more outliers on the higher end. This could imply that reviewers tend to express stronger sentiments when they feel positively about a movie. - The bar graph and length distributions further demonstrate that while the amount of text does not necessarily correlate with sentiment polarity, the intensity of sentiments (how strongly positive or negative a review is) varies significantly among reviews, regardless of their length.

Individual analysis and figures

Analysis and Figure 1

Distribution of Sentiments in IMDB Reviews: This bar chart shows the distribution of sentiments (positive and negative) in the IMDB reviews dataset. Although the bar chart may seem less useful here because the dataset has an equal number of positive and negative reviews to better control variables, this step is essential for understanding the overall structure of the dataset.

Analysis and Figure 2

Distribution of Review Lengths by Sentiment: This histogram illustrates the distribution of review lengths (in words) for both positive and negative sentiments. Histograms are ideal for showing the frequency distribution of continuous data. Using different colors for sentiments and faceting by sentiment helps in comparing the distributions effectively. - The histograms for “negative” and “positive” categories indicate the length of the text associated with each sentiment. - Both distributions are positively skewed, indicating most texts are shorter, but there are texts which are significantly longer. - Negative texts seem to have a wider distribution of lengths compared to positive ones, suggesting variability in the length when expressing negative sentiments.

Analysis and Figure 3

Boxplot of Review Lengths by Sentiment: This boxplot shows the spread and central tendency of review lengths for each sentiment category. Boxplots highlight the median, variability, and potential outliers in review lengths for each sentiment. - This shows the spread and distribution of sentiment scores categorized into “negative” and “positive” sentiments. - The negative sentiment scores are mostly concentrated towards the lower range with a few outliers, indicating mostly uniform negative sentiments with some exceptionally low scores. - The positive sentiment scores show a wider range but also include several outliers on the higher end, suggesting variability in the intensity of positive sentiments.

Analysis and Figure 4

The word cloud highlights the most frequent terms in the dataset. - Larger words such as “movie,” “film,” and “story” suggest that the dataset may be related to movie reviews. - A mix of positive terms (e.g., “excellent,” “wonderful”) and negative terms (e.g., “bad,” “awful”) is evident, providing insight into the common themes discussed in the texts. Changing the background to black in the word cloud will make the colors of the words pop more, enhancing visual contrast and making it easier to distinguish between the most and least frequent terms, thereby improving readability and focus on key themes in the dataset.

Analysis and Figure 5

A scatter plot is used here to visualize the potential correlation between two continuous variables: review length and sentiment score. This type of plot helps to identify patterns, trends, and possible outliers, providing insights into whether longer reviews tend to have different sentiment scores compared to shorter ones.
- The scatter plot indicates no strong linear correlation between review length and sentiment scores, as sentiment expressed in reviews does not consistently vary with their length. - Data points are predominantly clustered at shorter review lengths, suggesting that shorter reviews are more common, with both positive and negative sentiments distributed across all review lengths. - The presence of outliers and a wide dispersion of sentiment scores at various lengths highlight a complex relationship that may require advanced analytical techniques to fully understand underlying patterns.