library(tidyverse)
library(httr2)
library(jsonlite)
library(lubridate)
library(stringr)
library(purrr)
library(tidyr)
library(tibble)
library(tidytext) # Added for sentiment analysis
readRenviron("projap")
nyt_key <- Sys.getenv("NYT_API_KEY")
stopifnot(nchar(nyt_key) > 0)
# Null-coalescing helper: fall back to `y` when `x` is NULL or an empty string
`%or%` <- function(x, y) if (is.null(x) || identical(x, "")) y else x
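# A toy illustration of the fallback behaviour (made-up values, not from the API):
NULL %or% NA_character_      # -> NA
"" %or% "fallback"           # -> "fallback"
"A headline" %or% "fallback" # -> "A headline"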
# Article Search API endpoint
ARTSEARCH_URL <- "https://api.nytimes.com/svc/search/v2/articlesearch.json"
fetch_movie_reviews_via_articlesearch <- function(from, to, max_pages = 100L, sleep_sec = 6) {
  from <- as.Date(from); to <- as.Date(to)
  stopifnot(!is.na(from), !is.na(to), from <= to)
  key <- trimws(Sys.getenv("NYT_API_KEY"))
  stopifnot(nzchar(key))
  # Use keyword search instead of restrictive filters (which failed)
  begin_date <- format(from, "%Y%m%d")
  end_date <- format(to, "%Y%m%d")
  out <- list()
  for (page in 0:(max_pages - 1L)) {
    message(sprintf("Fetching page %d...", page))
    req <- httr2::request(ARTSEARCH_URL) |>
      httr2::req_url_query(
        `api-key` = key,
        q = "movie review OR film review", # Keyword search
        begin_date = begin_date,
        end_date = end_date,
        page = page
      ) |>
      httr2::req_user_agent("MSDS/nyt-articlesearch (httr2)") |>
      # req_perform() errors on 4xx/5xx by default; disable that so the
      # status check below can report the response body instead
      httr2::req_error(is_error = \(resp) FALSE)
    resp <- httr2::req_perform(req)
    if (httr2::resp_status(resp) != 200) {
      msg <- tryCatch(httr2::resp_body_string(resp), error = \(e) "<no body>")
      stop(sprintf("ArticleSearch failed [%s]\nBody: %s", httr2::resp_status(resp), msg))
    }
    js <- httr2::resp_body_json(resp, simplifyVector = FALSE)
    docs <- js$response$docs
    hits <- js$response$meta$hits %||% 0L
    if (!length(docs)) break
    # Filter to Review-type documents only
    review_docs <- Filter(function(d) {
      identical(d$type_of_material, "Review")
    }, docs)
    if (length(review_docs) > 0) {
      out[[length(out) + 1L]] <- purrr::map_dfr(review_docs, function(d) {
        tibble::tibble(
          id = d[["_id"]] %or% NA_character_,
          pub_date = lubridate::ymd_hms(d$pub_date %or% NA_character_, quiet = TRUE),
          headline = d$headline$main %or% NA_character_,
          byline = d$byline$original %or% NA_character_,
          summary_short = d$abstract %or% d$snippet %or% d$lead_paragraph %or% NA_character_,
          web_url = d$web_url %or% NA_character_,
          section_name = d$section_name %or% NA_character_,
          type_of_material = d$type_of_material %or% NA_character_,
          news_desk = d$news_desk %or% NA_character_
        )
      })
    }
    # Stop once all hits have been paged through (the API returns 10 docs per page)
    if ((page + 1L) * 10L >= hits) break
    Sys.sleep(sleep_sec)
  }
  res <- dplyr::bind_rows(out)
  if (!nrow(res)) return(tibble::tibble())
  # Create consistent column names
  res |>
    dplyr::mutate(
      display_title = headline,
      publication_date = as.Date(pub_date),
      link_url = web_url
    ) |>
    dplyr::distinct(display_title, publication_date, .keep_all = TRUE) |>
    dplyr::select(display_title, byline, headline, summary_short,
                  publication_date, link_url, id, pub_date,
                  section_name, type_of_material, news_desk) |>
    dplyr::arrange(publication_date, display_title)
}
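# Optional hardening (a sketch, not part of the run below): httr2 can retry
# transient failures such as HTTP 429/503 automatically. A request built the
# same way as inside the function would just gain a req_retry() step;
# `build_review_request` is a hypothetical helper name for illustration.
build_review_request <- function(key, begin_date, end_date, page) {
  httr2::request(ARTSEARCH_URL) |>
    httr2::req_url_query(
      `api-key` = key,
      q = "movie review OR film review",
      begin_date = begin_date,
      end_date = end_date,
      page = page
    ) |>
    httr2::req_user_agent("MSDS/nyt-articlesearch (httr2)") |>
    httr2::req_retry(max_tries = 3) # back off and retry transient 429/503 responses
}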
# Fetch movie reviews for 2022 using Article Search API
reviews_2022 <- fetch_movie_reviews_via_articlesearch(
  from = "2022-01-01",
  to = "2022-12-31",
  max_pages = 10,
  sleep_sec = 15 # Article Search API rate limit: 10 requests per minute
)
## Fetching page 0...
## Fetching page 1...
## Fetching page 2...
## Fetching page 3...
## Fetching page 4...
## Fetching page 5...
## Fetching page 6...
## Fetching page 7...
## Fetching page 8...
## Fetching page 9...
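# Optional caching (a sketch, assuming a writable "reviews_2022.rds" in the
# working directory): saving the fetched tibble avoids re-hitting the
# rate-limited API every time the document is re-knit.
if (!file.exists("reviews_2022.rds")) saveRDS(reviews_2022, "reviews_2022.rds")
# reviews_2022 <- readRDS("reviews_2022.rds") # reuse the cached copy on later runs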
# Preview
glimpse(reviews_2022)
## Rows: 69
## Columns: 11
## $ display_title <chr> "‘The 2022 Oscar Nominated Short Films’ Review: Small…
## $ byline <chr> "By Jeannette Catsoulis, Amy Nicholson and Ben Kenigs…
## $ headline <chr> "‘The 2022 Oscar Nominated Short Films’ Review: Small…
## $ summary_short <chr> "From near-future nightmares to inspirational sports …
## $ publication_date <date> 2022-02-24, 2022-02-24, 2022-03-17, 2022-03-24, 2022…
## $ link_url <chr> "https://www.nytimes.com/2022/02/24/movies/oscar-nomi…
## $ id <chr> "nyt://article/4b738df9-c872-5da2-9cbf-f5ee5946e779",…
## $ pub_date <dttm> 2022-02-24 17:10:57, 2022-02-24 05:45:02, 2022-03-17…
## $ section_name <chr> "Movies", "Movies", "Movies", "Movies", "Movies", "Mo…
## $ type_of_material <chr> "Review", "Review", "Review", "Review", "Review", "Re…
## $ news_desk <chr> "Weekend", "Weekend", "Weekend", "Weekend", "Weekend"…
cat("\nTotal reviews fetched:", nrow(reviews_2022), "\n")
##
## Total reviews fetched: 69
# Words from the short summary
reviews_words_2022 <- reviews_2022 %>%
  filter(!is.na(summary_short), nchar(summary_short) > 0) %>%
  unnest_tokens(word, summary_short) %>%
  anti_join(stop_words, by = "word")
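# Optional extra cleaning (a sketch, not applied to the counts below):
# unnest_tokens() keeps purely numeric tokens such as years ("2022"), which
# could be dropped if they crowd the word counts. The object name
# `reviews_words_2022_nonum` is just illustrative.
reviews_words_2022_nonum <- reviews_words_2022 %>%
  filter(!str_detect(word, "^[0-9]+$"))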
# Most common words
cat("\nMost common words in reviews:\n")
##
## Most common words in reviews:
reviews_words_2022 %>% count(word, sort = TRUE) %>% head(20)
## # A tibble: 20 × 2
## word n
## <chr> <int>
## 1 film 18
## 2 documentary 9
## 3 movie 8
## 4 comedy 7
## 5 star 6
## 6 stars 5
## 7 drama 4
## 8 kevin 4
## 9 plays 4
## 10 series 4
## 11 animated 3
## 12 black 3
## 13 cinema 3
## 14 family 3
## 15 horror 3
## 16 killer 3
## 17 life 3
## 18 musical 3
## 19 power 3
## 20 real 3
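# A quick visual of the same counts (a sketch; slice_max() keeps ties, so a
# few more than 15 bars are possible). Domain words such as "film" and
# "movie" dominate, which is expected for review summaries.
reviews_words_2022 %>%
  count(word, sort = TRUE) %>%
  slice_max(n, n = 15) %>%
  ggplot(aes(x = n, y = reorder(word, n))) +
  geom_col(fill = "steelblue") +
  theme_minimal() +
  labs(
    title = "Most Common Words in 2022 Review Summaries",
    x = "Count",
    y = NULL
  )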
# Sentiment analysis using Bing
review_sums <- reviews_words_2022 %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(display_title, publication_date, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(
    sentiment_score = positive - negative,
    total_sentiment_words = positive + negative
  ) %>%
  left_join(
    reviews_2022 %>% select(display_title, publication_date, summary_short, byline),
    by = c("display_title", "publication_date")
  )
# Summary statistics
cat("\nSentiment Analysis Summary:\n")
##
## Sentiment Analysis Summary:
cat("Total reviews analyzed:", nrow(review_sums), "\n")
## Total reviews analyzed: 52
cat("Mean sentiment score:", mean(review_sums$sentiment_score, na.rm = TRUE), "\n")
## Mean sentiment score: -0.2692308
cat("Median sentiment score:", median(review_sums$sentiment_score, na.rm = TRUE), "\n")
## Median sentiment score: -1
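# Reviews at the extremes of the score (a sketch; ties may return extra rows):
review_sums %>%
  slice_max(sentiment_score, n = 3) %>%
  select(display_title, sentiment_score, positive, negative)
review_sums %>%
  slice_min(sentiment_score, n = 3) %>%
  select(display_title, sentiment_score, positive, negative)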
# Distribution plot
ggplot(review_sums, aes(x = sentiment_score)) +
  geom_histogram(binwidth = 1, fill = "steelblue", color = "white") +
  theme_minimal() +
  labs(
    title = "Distribution of Sentiment Scores for 2022 Movie Reviews",
    x = "Sentiment Score (Positive - Negative)",
    y = "Count"
  )
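# Sentiment over the year (a sketch): aggregating scores by publication month
# shows whether the tone of the summaries drifted across 2022.
review_sums %>%
  mutate(month = floor_date(publication_date, "month")) %>%
  group_by(month) %>%
  summarise(mean_score = mean(sentiment_score), n_reviews = n(), .groups = "drop") %>%
  ggplot(aes(x = month, y = mean_score)) +
  geom_col(fill = "steelblue") +
  theme_minimal() +
  labs(
    title = "Mean Sentiment Score by Month, 2022 Movie Reviews",
    x = "Month",
    y = "Mean Sentiment Score"
  )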
# GitHub: https://github.com/lher96/MSDS-Assignments/blob/main/Sentiment%20Analysis