0) Project goal + research questions

We use the Horror Movies dataset (Kaggle) to answer three big themes with course-style visual storytelling:

Theme A — Evolution (time, sub-genres, runtime)

  1. RQ-A1: How did the number of horror releases change over time (and where are the “boom” years)?
  2. RQ-A2: Did horror become more “hybrid” over time (more co-genres per film)?
  3. RQ-A3: Which co-genres (Thriller, Mystery, Comedy…) rose and fell over decades?
  4. RQ-A4: Did runtime distributions shift across decades?
  5. RQ-A5: What does the co-genre ecosystem look like (as a network)?

Theme B — Economics (budget, revenue, sleeper hits)

  1. RQ-B1: How strongly are budget and revenue linked—and where do we find surprising hits (high ROI on low budgets)?
  2. RQ-B2: Is the “typical” budget (and revenue) trending upward over time?
  3. RQ-B3: Which films were the most profitable, and what were their co-genres?

Theme C — Audience reception (ratings vs budget/profitability/runtime)

  1. RQ-C1: Do higher budgets buy higher ratings?
  2. RQ-C2: Do more profitable movies get higher ratings?
  3. RQ-C3: Is there a runtime “sweet spot” for audience ratings?
  4. RQ-C4: Which co-genres are associated with higher ratings? (descriptive)

Notes: Financial fields often contain zeros. In this report, we treat a value of 0 in budget, revenue, or runtime as missing; ratings and vote counts of 0 are treated as missing as well.


1) Libraries + data import

library(tidyverse)
library(lubridate)
library(scales)
library(stringr)
library(tidyr)
library(dplyr)

# Advanced viz 
library(ggridges)   # ridge plots
library(ggExtra)    # marginal plots
library(plotly)     # interactive graphics
library(patchwork)  # arranging multiple plots
library(ggrepel)    # nicer labels/annotations
library(ggiraph)
library(DT)
library(ggplot2)
library(viridis)
# Load the raw Kaggle horror-movies export into `movies_raw`.
# The file is read through an explicit path instead of calling setwd(), so the
# session's working directory is left untouched and later relative paths keep
# resolving against wherever the script was started from.
# NOTE(review): data_dir is still the original author's machine-specific
# location -- adjust it (or make it project-relative) before running elsewhere.
data_dir <- "/Users/majid/Documents/3-third semester/Advance R/RVisualization-Horror-Movie-main/data/raw"
movies_raw <- readr::read_csv(
  file.path(data_dir, "horror_movies.csv"),
  show_col_types = FALSE
)
glimpse(movies_raw)
## Rows: 32,540
## Columns: 21
## $ ...1              <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
## $ id                <dbl> 760161, 760741, 882598, 756999, 772450, 1014226, 717…
## $ original_title    <chr> "Orphan: First Kill", "Beast", "Smile", "The Black P…
## $ title             <chr> "Orphan: First Kill", "Beast", "Smile", "The Black P…
## $ original_language <chr> "en", "en", "en", "en", "es", "es", "en", "en", "en"…
## $ overview          <chr> "After escaping from an Estonian psychiatric facilit…
## $ tagline           <chr> "There's always been something wrong with Esther.", …
## $ release_date      <date> 2022-07-27, 2022-08-11, 2022-09-23, 2022-06-22, 202…
## $ poster_path       <chr> "/pHkKbIRoCe7zIFvqan9LFSaQAde.jpg", "/xIGr7UHsKf0URW…
## $ popularity        <dbl> 5088.584, 2172.338, 1863.628, 1071.398, 1020.995, 93…
## $ vote_count        <dbl> 902, 584, 114, 2736, 83, 1, 125, 1684, 73, 1035, 637…
## $ vote_average      <dbl> 6.9, 7.1, 6.8, 7.9, 7.0, 1.0, 5.8, 7.0, 6.5, 6.8, 7.…
## $ budget            <dbl> 0, 0, 17000000, 18800000, 0, 0, 20000000, 68000000, …
## $ revenue           <dbl> 9572765, 56000000, 45000000, 161000000, 0, 0, 289259…
## $ runtime           <dbl> 99, 93, 115, 103, 0, 0, 88, 130, 90, 106, 98, 89, 97…
## $ status            <chr> "Released", "Released", "Released", "Released", "Rel…
## $ adult             <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
## $ backdrop_path     <chr> "/5GA3vV1aWWHTSDO5eno8V5zDo8r.jpg", "/2k9tBql5GYH328…
## $ genre_names       <chr> "Horror, Thriller", "Adventure, Drama, Horror", "Hor…
## $ collection        <dbl> 760193, NA, NA, NA, NA, NA, 94899, NA, NA, 950289, N…
## $ collection_name   <chr> "Orphan Collection", NA, NA, NA, NA, NA, "Jeepers Cr…

2) Cleaning + feature engineering

# Clean the raw table and derive the time / genre features used by all themes.
#   * year / decade are derived from release_date.
#   * A value of 0 in runtime, budget, revenue, vote_average or vote_count is
#     recoded to NA (treated as "not recorded").
#   * genre_list is a list-column holding one character vector of genres per
#     film; n_genres is the number of genres in that vector.
# Only films with a known release year from 1950 onwards are kept.
movies <- movies_raw %>%
  mutate(
    release_date = as.Date(release_date),
    year = year(release_date),
    decade = floor(year / 10) * 10,
    runtime = na_if(runtime, 0),
    budget = na_if(budget, 0),
    revenue = na_if(revenue, 0),
    vote_average = na_if(vote_average, 0),
    vote_count = na_if(vote_count, 0),
    # Split genre_names like "Horror, Mystery, Thriller".
    # coalesce() replaces a missing genre string element-wise; the previous
    # `%||%` only tests for NULL, so it never fired on a column and NA strings
    # were split into a phantom NA "genre" (n_genres = 1). Empty pieces are
    # dropped so a film without genres gets character(0) and n_genres = 0.
    genre_list = purrr::map(
      str_split(coalesce(genre_names, ""), ",\\s*"),
      ~ .x[.x != ""]
    ),
    n_genres = lengths(genre_list)
  ) %>%
  # The former upper bound `year <= max(year, na.rm = TRUE)` was always TRUE
  # for non-missing years, so it has been removed.
  filter(!is.na(year), year >= 1950)

# Helper dataset for economics / ratings analysis
# Financial subset used by the economics and ratings sections: films with a
# recorded, positive budget and revenue, plus profit, ROI (revenue / budget),
# log10 money columns and an ordered profitability class.
movies_fin <- movies %>%
  filter(!is.na(budget), !is.na(revenue)) %>%
  filter(budget > 0, revenue > 0) %>%
  mutate(
    profit = revenue - budget,
    roi = revenue / budget,
    log_budget = log10(budget),
    log_revenue = log10(revenue)
  ) %>%
  mutate(
    # first matching rule wins: losses, then ROI below 2x, below 5x, the rest
    profit_flag = factor(
      case_when(
        profit < 0 ~ "Loss",
        roi < 2 ~ "Low profit",
        roi < 5 ~ "Hit",
        TRUE ~ "Sleeper/Blockbuster"
      ),
      levels = c("Loss", "Low profit", "Hit", "Sleeper/Blockbuster")
    )
  )

# Quick numeric overview of the cleaned table's key columns.
movies %>%
  select(year, runtime, vote_average, n_genres) %>%
  summary()
##       year         runtime        vote_average       n_genres     
##  Min.   :1950   Min.   :  1.00   Min.   : 0.500   Min.   : 1.000  
##  1st Qu.:2000   1st Qu.: 26.00   1st Qu.: 4.100   1st Qu.: 1.000  
##  Median :2012   Median : 83.00   Median : 5.200   Median : 2.000  
##  Mean   :2007   Mean   : 67.69   Mean   : 5.191   Mean   : 1.991  
##  3rd Qu.:2018   3rd Qu.: 92.00   3rd Qu.: 6.100   3rd Qu.: 3.000  
##  Max.   :2022   Max.   :683.00   Max.   :10.000   Max.   :16.000  
##                 NA's   :2668     NA's   :11629
# Quick numeric overview of the financial subset.
movies_fin %>%
  select(budget, revenue, profit, roi) %>%
  summary()
##      budget             revenue              profit          
##  Min.   :        1   Min.   :        1   Min.   :-194775779  
##  1st Qu.:  1000000   1st Qu.:   797276   1st Qu.:   -127461  
##  Median :  5375000   Median : 11009988   Median :   2729260  
##  Mean   : 12642663   Mean   : 38559968   Mean   :  25917305  
##  3rd Qu.: 15000000   3rd Qu.: 44975559   3rd Qu.:  29052546  
##  Max.   :200000000   Max.   :701842551   Max.   : 666842551  
##       roi          
##  Min.   :   0.000  
##  1st Qu.:   0.667  
##  Median :   2.013  
##  Mean   :  26.100  
##  3rd Qu.:   4.830  
##  Max.   :6666.667

Theme A — Evolution over time

A1) Releases per year (area chart + “boom years” annotations)

# Number of releases per year.
by_year <- movies %>%
  group_by(year) %>%
  summarise(n_movies = n(), .groups = "drop")

# "Boom years": the single busiest year within each decade
# (ties are broken by taking the first row).
boom_years <- by_year %>%
  group_by(decade = floor(year / 10) * 10) %>%
  slice_max(n_movies, n = 1, with_ties = FALSE) %>%
  ungroup()

# RQ-A1 figure: area of yearly counts, a smooth trend, and labelled boom years.
p_releases <- ggplot(by_year, aes(x = year, y = n_movies))

p_releases <- p_releases +
  geom_area(alpha = 0.7) +
  geom_smooth(se = FALSE, linewidth = 1) +
  geom_point(data = boom_years, size = 2) +
  ggrepel::geom_text_repel(
    aes(label = paste0(year, ": ", comma(n_movies))),
    data = boom_years,
    size = 3,
    max.overlaps = 20
  )

p_releases <- p_releases +
  scale_y_continuous(labels = comma) +
  labs(
    title = "RQ-A1: Horror releases over time",
    subtitle = "Area chart with a smooth trend and the highest-release year in each decade",
    x = NULL,
    y = "Number of releases"
  ) +
  theme_minimal(base_size = 12)

p_releases

In this plot we count how many horror movies were released each year. The grey area shows the number of releases, and the blue smooth line shows the general trend.

What we see:

  • From the 1950s to the 1980s, the number of horror releases is low and slowly increasing. For example, one “boom year” in the 1950s is around 1958 (75 movies), which is small compared to later decades.
  • In the 1970s and 1980s, the numbers become more stable and higher, but still not very big. We see a boom year like 1972 (209) and later 1988 (413).
  • After 2000, the growth becomes much faster. The curve goes up strongly, and a boom year like 2009 (808) is already much bigger than all earlier decades.
  • The biggest change is after 2015. Releases increase very quickly, reaching very high values in the late 2010s and 2020.
  • The highest point is 2020 (~2,100 movies), and 2019 (~1,895) is also extremely high. This shows a clear “boom” period.

Overall, horror movies became more and more common over time, but the real explosion happens in the last 15–20 years. The genre looks like it is growing a lot, especially in the late 2010s and around 2020. The small drop after 2020 may be because the most recent years are incomplete in the dataset, or because the movie industry changed during that time.

A2) Genre hybridization (how many genres per film over time?)

library(dplyr)
library(ggplot2)
library(scales)

# Per-year summary of how many genres are listed per film: mean, sd, film
# count, standard error and a normal-approximation 95% interval (lo / hi).
hybrid <- movies %>%
  filter(!(is.na(year) | is.na(n_genres))) %>%
  group_by(year) %>%
  summarise(
    avg_n_genres = mean(n_genres),
    sd_n_genres = sd(n_genres),
    n = n(),
    .groups = "drop"
  ) %>%
  mutate(
    se = sd_n_genres / sqrt(n),
    ci = 1.96 * se,
    lo = avg_n_genres - ci,
    hi = avg_n_genres + ci
  )

# RQ-A2 figure: yearly mean number of genres per film with its 95% interval,
# a loess trend, and the yearly film count rescaled onto the same y-range.
p_hybrid <- ggplot(hybrid, aes(x = year, y = avg_n_genres))

# Layers, back to front: interval ribbon, raw yearly series, loess trend,
# rescaled film count, and the note explaining the orange line.
p_hybrid <- p_hybrid +
  geom_ribbon(aes(ymin = lo, ymax = hi), fill = "#4C78A8", alpha = 0.18) +
  geom_line(color = "grey25", alpha = 0.55, linewidth = 0.9) +
  geom_point(color = "grey25", alpha = 0.55, size = 1.2) +
  geom_smooth(method = "loess", se = FALSE, color = "#2F5597", linewidth = 1.6) +
  geom_line(
    aes(y = rescale(n, to = range(avg_n_genres, na.rm = TRUE))),
    color = "#F28E2B",
    linewidth = 0.9,
    alpha = 0.6
  ) +
  annotate(
    "text",
    x = min(hybrid$year, na.rm = TRUE) + 2,
    y = max(hybrid$avg_n_genres, na.rm = TRUE),
    label = "Orange line = relative film count",
    hjust = 0,
    vjust = 1.2,
    size = 3.6,
    color = "#F28E2B"
  )

# Axes and titles.
p_hybrid <- p_hybrid +
  scale_y_continuous(
    breaks = pretty_breaks(),
    expand = expansion(mult = c(0.02, 0.06))
  ) +
  scale_x_continuous(breaks = pretty_breaks(6)) +
  labs(
    title = "RQ-A2: Is horror becoming more hybrid?",
    subtitle = "Average number of listed genres per film • ribbon = 95% CI (yearly mean)",
    x = NULL,
    y = "Avg. number of genres per film"
  )

# Theme: bold title, muted subtitle, horizontal major grid lines only.
p_hybrid <- p_hybrid +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold", size = 18),
    plot.subtitle = element_text(size = 12, color = "grey30"),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank()
  )

p_hybrid

Here we are checking genre hybridization, meaning: does a horror movie usually come with only “Horror”, or is it mixed with other genres like Thriller, Mystery, Comedy, Drama, etc.? The y-axis is the average number of genres listed per film each year.

  • Black line + dots = real yearly averages (can be noisy year-to-year).
  • Blue thick line = smooth trend (overall direction across decades).
  • Light blue ribbon = 95% confidence interval around the yearly mean (wider = more uncertainty, usually fewer films).
  • Orange line = relative film count (how many films you have each year, scaled so it can be shown on the same plot).

What we see

  • In the 1950s, the average sits around ~2.2–2.3 genres per film, but the confidence ribbon is very wide. → This means early years are more uncertain (likely fewer movies in the dataset).

  • Through the 1960s–1970s, the average stays pretty stable (still around ~2.1–2.2). → Horror is commonly tagged as Horror + one more genre.

  • In the late 1970s to mid-1980s, the plot shows the highest hybridization (peaks close to ~2.4–2.5). → This is the period where horror films are most often labeled with multiple co-genres (e.g., Horror–Sci-Fi, Horror–Mystery, Horror–Thriller).

  • After the late 1980s / early 1990s, the blue trend starts declining.

  • From about 2000 onward, the average drops more clearly, reaching around ~1.9–2.0 by the late 2010s / early 2020s. → In the dataset, recent horror movies are tagged with fewer genres, so they look less hybrid overall.

  • Meanwhile, the orange line rises strongly after ~2005, meaning the dataset contains many more films in recent years. → So the decrease in genres per film happens even while film counts increase, which supports the idea that this is not just random noise.

Overall: based on this dataset, horror movies look most hybrid in the 1980s, and less hybrid in the last 20–25 years (fewer genres listed per movie). As a caveat, part of this trend could reflect changes in tagging practices (how platforms label genres over time), not only a true change in filmmaking.

A4) Runtime evolution (ridge plot by decade)

# Films from the 1960s onwards with a runtime between 40 and 220 minutes;
# decade becomes a factor so each decade gets its own ridge.
runtime_decade <- movies %>%
  filter(!is.na(runtime)) %>%
  filter(runtime >= 40, runtime <= 220) %>%
  filter(decade >= 1960) %>%
  mutate(decade = factor(decade))

# RQ-A4 figure: one runtime density ridge per decade, filled by runtime value.
p_runtime <- ggplot(
  runtime_decade,
  aes(x = runtime, y = decade, fill = after_stat(x))
)

p_runtime <- p_runtime +
  ggridges::geom_density_ridges_gradient(
    scale = 2.2,
    rel_min_height = 0.01,
    size = 0.3
  ) +
  scale_fill_viridis_c(option = "C", name = "Runtime")

p_runtime <- p_runtime +
  labs(
    title = "RQ-A4: Runtime distributions across decades",
    subtitle = "Ridge plot (trimmed to 40–220 minutes to reduce outliers)",
    x = "Runtime (minutes)",
    y = "Decade"
  ) +
  theme_minimal(base_size = 12)

p_runtime

This ridge plot shows how horror movie runtimes are distributed in each decade (each “ridge” is one decade). The x-axis is runtime in minutes. The shape tells us where most movies are concentrated. (We trimmed the data to 40–220 minutes to remove extreme outliers.)

What we see

  • Most horror movies are around 85–100 minutes in almost every decade. The highest part of each ridge is usually near ~90 minutes, so this looks like the “classic” horror length.

  • In older decades (1960s–1970s), the ridge is a bit more spread out and sometimes has small bumps. This means runtime was less consistent and there were more movies that are shorter or longer compared to the main peak.

  • From the 1980s to 2000s, the distributions become more concentrated around 90 minutes. So horror becomes more standardized, like producers follow a common format.

  • In the 2010s and 2020s, the main peak is still around 90 minutes, but we also see a longer right tail (more movies above 110–130 minutes). That suggests recent decades include more long-form horror (often psychological / story-driven) besides the typical short runtime.

Simply put: across decades, horror movies usually stay close to about 90 minutes, so this is like the normal runtime of the genre. But in newer decades, we see more movies that are longer than before, meaning modern horror sometimes wants more time for story, tension, and slow build-up. Still, the majority remains around the same classic length. Because the dataset includes many movies in recent years, the ridge shapes for 2010s/2020s may be smoother and more detailed (more data points), while early decades can look more “bumpy” because there are fewer films recorded.

A5) What does the co-genre ecosystem look like (as a network)?

library(visNetwork)

# build co-genre pairs per movie (excluding Horror)
# Co-genre network: nodes are genres listed alongside Horror, edges connect two
# co-genres that are listed on the same film.

# One row per film x unordered co-genre pair (Horror itself excluded).
# Films with fewer than two co-genres contribute no pairs.
pairs_df <- movies %>%
  mutate(cogenres = purrr::map(genre_list, ~ sort(unique(setdiff(.x, "Horror"))))) %>%
  mutate(pairs = purrr::map(cogenres, ~ {
    if (length(.x) < 2) return(tibble::tibble(from = character(), to = character()))
    m <- t(combn(.x, 2))
    tibble::tibble(from = m[, 1], to = m[, 2])
  })) %>%
  select(id, pairs) %>%
  tidyr::unnest(pairs)

# Keep only pairs that co-occur on at least 120 films.
edges <- pairs_df %>%
  count(from, to, sort = TRUE, name = "weight") %>%
  filter(weight >= 120)  # tune threshold to avoid clutter

# Per co-genre statistics.
# freq counts ALL films carrying the co-genre, because it drives node size and
# the "Movies:" tooltip. Previously the table was filtered to rated films
# first, which undercounted every genre. mean_rating is still computed only
# over films that have both a rating and a vote count.
node_stats <- movies %>%
  mutate(cogenres = purrr::map(genre_list, ~ setdiff(.x, "Horror"))) %>%
  select(vote_average, vote_count, cogenres) %>%
  tidyr::unnest(cogenres) %>%
  filter(!is.na(cogenres), cogenres != "") %>%
  group_by(cogenres) %>%
  summarise(
    freq = n(),
    mean_rating = mean(
      vote_average[!is.na(vote_average) & !is.na(vote_count)]
    ),
    .groups = "drop"
  )

# Node table: one row per genre that survives the edge threshold.
# `value` sets the node size and `title` is the HTML hover tooltip.
nodes <- tibble::tibble(id = unique(c(edges$from, edges$to))) %>%
  left_join(node_stats, by = c("id" = "cogenres")) %>%
  mutate(
    label = id,
    value = freq,
    title = paste0("<b>", id, "</b><br>Movies: ", freq, "<br>Mean rating: ", round(mean_rating, 2))
  )

# Interactive network with a Fruchterman-Reingold layout computed via igraph.
visNetwork(nodes = as.data.frame(nodes), edges = as.data.frame(edges)) %>%
  visIgraphLayout(layout = "layout_with_fr") %>%
  visOptions(highlightNearest = TRUE, nodesIdSelection = TRUE) %>%
  visEdges(smooth = FALSE) %>%
  visPhysics(stabilization = TRUE) %>%
  visInteraction(hover = TRUE)

Here we are checking the co-genre ecosystem as a network — meaning: which genres tend to appear together with Horror, and how are those co-genres connected to each other?

In this network, each node is a genre that co-occurs with Horror, and each edge (line) means two genres appear together in the same movie (alongside Horror).

  • Node size = how common that co-genre is (bigger = appears with Horror more often).
  • Lines (edges) = co-occurrence links (genres that frequently show up together in the same films).
  • Clusters / proximity = genres that often mix together form tight groups.

What we see

  • Thriller is the main hub (largest node, most connections). → This means Horror is most often combined with Thriller, and Thriller also connects strongly to many other genres (it acts like a “bridge” genre).

  • Around Thriller, we see other highly connected genres like Mystery, Drama, Comedy, and Science Fiction. → These genres appear often with Horror and also combine with each other, creating a dense “core” of hybrid horror.

  • The network has a clear core–periphery structure:

    • Core (dense center): Thriller, Mystery, Drama, Comedy, Sci-Fi, Action, Fantasy → lots of links, many hybrid combinations.
    • Periphery (outer nodes): Romance, Animation, TV Movie, Adventure, Crime → fewer links and/or less frequent co-occurrence.
  • Comedy and Drama connect into the core, showing that horror often branches into:

    • Horror–Comedy (camp, parody, dark comedy)
    • Horror–Drama (psychological / character-driven horror)
  • Science Fiction and Action also sit close to the core and connect broadly. → This suggests a strong “horror–sci-fi/action” lane (monsters, apocalyptic stories, creature features).

  • Some genres look more specialized:

    • Animation is more isolated (few links). → Horror animation exists, but it’s rarer and less mixed.
    • Romance also sits on the edge. → Horror–romance combos happen, but they’re not a dominant pattern.

Overall: the co-genre ecosystem shows a dense central cluster dominated by Thriller (the strongest partner of Horror), with Mystery/Drama/Comedy/Sci-Fi forming the main supporting mix. Outside this core are more niche pairings like Animation and Romance, which occur less often and connect less broadly.


Theme B — Economics

B1) Budget vs revenue (scatter + marginal distributions) + “sleeper hits”

# Financial subset for the budget/revenue scatter. `sleeper` flags films in the
# top 5% of ROI that also sit in the bottom 40% of budgets (both quantiles are
# computed on this filtered subset).
fin <- movies_fin %>%
  filter(budget >= 1e5, revenue >= 1e5) %>%  # reduce extreme noise from tiny numbers
  mutate(
    sleeper = roi >= quantile(roi, 0.95, na.rm = TRUE) & budget <= quantile(budget, 0.40, na.rm = TRUE)
  )

# RQ-B1 figure: budget vs revenue on log-log axes; sleeper hits are drawn
# fully opaque, all other films faintly.
p_scatter <- ggplot(fin, aes(x = budget, y = revenue)) +
  geom_point(aes(alpha = sleeper), size = 1.8) +
  # An explicit alpha scale avoids ggplot2's "Using alpha for a discrete
  # variable is not advised" warning. 0.1 / 1 are the values the default
  # discrete alpha scale assigns to two levels, so the plot looks the same.
  scale_alpha_manual(values = c("FALSE" = 0.1, "TRUE" = 1)) +
  scale_x_log10(labels = label_number(scale_cut = cut_short_scale())) +
  scale_y_log10(labels = label_number(scale_cut = cut_short_scale())) +
  labs(
    title = "RQ-B1: Budget vs revenue (log scale) + sleeper hits",
    subtitle = "Sleeper hit = top 5% ROI AND bottom 40% budget (heuristic)",
    x = "Budget (log10)", y = "Revenue (log10)", alpha = "Sleeper hit"
  ) +
  theme_minimal(base_size = 12)

# Add marginal histograms of budget (top) and revenue (right).
ggExtra::ggMarginal(p_scatter, type = "histogram", bins = 30)

This plot compares budget vs revenue for horror movies. Both axes are in log scale, so we can see small and big movies together (because budgets and revenues are very spread).

  • Each dot = one movie
  • X-axis = budget (log10) → more to the right = higher budget
  • Y-axis = revenue (log10) → higher = more revenue
  • The top and right histograms show the distributions of budget and revenue.
  • Solid (fully opaque) black points = “sleeper hits” (our rule: top 5% ROI and bottom 40% budget); all other movies are drawn as faint, semi-transparent points.

What we see

2) Most horror movies are “mid budget” in the dataset

From the top histogram, the highest bars are around the middle budgets (not extremely small and not extremely huge). So most horror films here are not blockbuster-level, but also not extremely tiny.

3) Revenue distribution has a long tail

On the right histogram, we see many movies earn “normal” amounts, but a smaller number reach very high revenue (tens/hundreds of millions). So the genre has some very big winners, but they are not the majority.

4) Sleeper hits appear on the “left but high” area

The black dots are mostly left side (lower budgets) but high on y-axis (high revenue). This matches the idea of sleeper hits: they didn’t cost much, but they earned a lot, so their ROI becomes huge.

5) Key message

Horror is a genre where low-budget movies can still become very profitable. Compared to some other genres, horror can succeed with smaller budgets because it relies more on atmosphere, story, and suspense rather than expensive special effects. (This is a logical interpretation based on the pattern.)

Overall, this chart shows that budget and revenue are connected, but not strongly. Many movies with medium or even low budgets can still earn very high revenue. The solid black points (sleeper hits) show that horror can deliver surprising success even with small budgets.

Identify the “most surprising” wins (ROI leaders)

# The 15 films with the highest ROI in `fin`, reduced to the columns shown in
# the report table.
top_roi <- fin %>%
  select(title, year, budget, revenue, profit, roi, vote_average, vote_count) %>%
  arrange(desc(roi)) %>%
  head(15)

top_roi
## # A tibble: 15 × 8
##    title                year budget revenue profit   roi vote_average vote_count
##    <chr>               <dbl>  <dbl>   <dbl>  <dbl> <dbl>        <dbl>      <dbl>
##  1 Paranormal Activity  2007 2.15e5  1.94e8 1.94e8 903.           6         4309
##  2 Lady Frankenstein    1971 2   e5  1.40e8 1.39e8 698.           5           39
##  3 The Gallows          2015 1   e5  4.27e7 4.26e7 427.           5          841
##  4 Open Water           2004 1.3 e5  5.47e7 5.45e7 421.           5.5        905
##  5 Night of the Livin…  1968 1.14e5  3   e7 2.99e7 263.           7.6       1935
##  6 Halloween            1978 3.25e5  7.03e7 6.99e7 216.           7.6       4421
##  7 The Legend of Bogg…  1972 1.6 e5  2.20e7 2.18e7 138.           5           48
##  8 The Hills Have Eyes  1977 2.3 e5  2.5 e7 2.48e7 109.           6.2        693
##  9 Friday the 13th      1980 5.50e5  5.98e7 5.92e7 109.           6.4       2289
## 10 The Devil Inside     2012 1   e6  1.02e8 1.01e8 102.           4.7        678
## 11 Taxiwala             2018 1   e6  1   e8 9.9 e7 100            6.1         21
## 12 The Rocky Horror P…  1975 1.20e6  1.13e8 1.12e8  94.1          7.5       2326
## 13 The Quiet Ones       2014 2   e5  1.78e7 1.76e7  89.2          5.1        584
## 14 Saw                  2004 1.20e6  1.04e8 1.03e8  86.6          7.4       7650
## 15 Dawn of the Dead     1978 6.5 e5  5.50e7 5.44e7  84.6          7.5       1674

This table is showing the “most surprising wins” in horror, meaning movies with the highest ROI (Return on Investment).

What ROI means

  • ROI ≈ revenue / budget
  • So ROI = 100x means the movie made about 100 times its budget (very rare in most genres).

What we learn from this table

1) Horror can produce crazy high ROI with small budgets

Look at the top rows:

  • Paranormal Activity (2007): budget around $215k, revenue around $194M, ROI about 903x → This is a perfect example of a “small movie → huge success”.

  • The Gallows (2015): budget $100k, revenue $42.7M, ROI ~427x

  • Open Water (2004): budget $130k, revenue $54.7M, ROI ~421x

These are “surprising” because they didn’t need big money to become profitable.

2) Classic horror franchises also show strong ROI

Some famous titles appear too:

  • Halloween (1978): ROI ~216x
  • Friday the 13th (1980): ROI ~109x
  • Saw (2004): ROI ~86.6x

So even well-known horror classics were originally made cheaply compared to their revenue.

3) Ratings are not always super high, but profits are huge

Many of these movies have ratings around 5–7, not necessarily perfect. This suggests that financial success ≠ only high rating. Horror can sell well because of:

  • strong concept / marketing
  • cinema experience
  • word-of-mouth
  • “must-watch” hype

4) Vote_count matters (data reliability)

Some movies have very small vote counts (like 21, 39, 48). Those results can still be real, but the audience-rating info is less reliable than movies with thousands of votes (like Saw with 7,650 votes).

This table proves horror is a genre where low-budget movies can become extremely profitable. The top ROI movies earned tens or hundreds of millions with budgets sometimes below $1M. Also, high ROI does not always mean high rating, so audience enjoyment and box-office success are not exactly the same thing.

B3) Which films were the most profitable, and what were their co-genres?

# Top 25 films by ROI from the full financial subset, with their co-genres
# (every listed genre except Horror), a wrapped title for the axis, and an
# HTML tooltip for the interactive chart.
roi_leaders <- movies_fin %>%
  mutate(
    cogenres = purrr::map(genre_list, ~ setdiff(.x, "Horror")),
    n_cogenres = lengths(cogenres),
    # "None" when Horror is the only genre, otherwise a comma-separated list
    cogenres_txt = vapply(
      cogenres,
      function(g) {
        if (length(g) == 0) {
          "None"
        } else {
          paste(g, collapse = ", ")
        }
      },
      character(1)
    ),
    profit = revenue - budget,
    title_wrapped = stringr::str_wrap(title, width = 28)
  ) %>%
  mutate(
    tooltip = paste0(
      "<b>", title, "</b> (", year, ")<br>",
      "<b>ROI:</b> ", round(roi, 1), "x<br>",
      "<b>Profit:</b> ", scales::dollar(profit), "<br>",
      "<b>Budget:</b> ", scales::dollar(budget),
      " | <b>Revenue:</b> ", scales::dollar(revenue), "<br>",
      "<b>#Co-genres:</b> ", n_cogenres, "<br>",
      "<b>Co-genres:</b> ", cogenres_txt, "<br>",
      "<b>Rating:</b> ", vote_average, " (n=", vote_count, ")"
    )
  ) %>%
  arrange(desc(roi)) %>%
  head(25)

# RQ-B3 figure: lollipop chart of the ROI leaders. Stems start at ROI = 1
# (break-even); points are coloured by number of co-genres, sized by profit,
# and carry the hover tooltip.
p_roi <- ggplot(roi_leaders, aes(x = reorder(title_wrapped, roi), y = roi))

p_roi <- p_roi +
  geom_segment(
    aes(xend = title_wrapped, y = 1, yend = roi),
    linewidth = 0.7,
    alpha = 0.25
  ) +
  geom_point_interactive(
    aes(color = n_cogenres, size = profit, tooltip = tooltip, data_id = id),
    alpha = 0.95
  ) +
  coord_flip(clip = "off")

# Scales: log ROI axis, viridis colour for the co-genre count, dollar-labelled size.
p_roi <- p_roi +
  scale_y_log10(
    breaks = c(1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000),
    labels = scales::label_number()
  ) +
  scale_color_viridis_c(option = "D", end = 0.95) +
  scale_size_continuous(range = c(2.5, 8), labels = scales::dollar)

p_roi <- p_roi +
  labs(
    title = "RQ-B3: Most profitable horror films",
    subtitle = "ROI shown on a log scale • Color = number of co-genres • Size = profit",
    x = NULL,
    y = "ROI = revenue / budget (log scale)",
    color = "# Co-genres",
    size = "Profit",
    caption = "Note: very small budgets can produce extreme ROI values."
  )

p_roi <- p_roi +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold", size = 18),
    plot.subtitle = element_text(size = 12),
    legend.position = "right",
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    plot.margin = margin(10, 10, 10, 10)  # was wider for the labels
  )

# Render the ROI chart as an interactive SVG with hover highlight, styled
# tooltips and zoom.
girafe_options(
  girafe(ggobj = p_roi, width_svg = 11, height_svg = 7),
  opts_hover(css = "stroke:#111;stroke-width:2;"),
  opts_tooltip(css = "background-color:white;padding:10px;border:1px solid #999;border-radius:8px;"),
  opts_zoom(max = 4)
)

# Interactive table (cleaner + searchable) of the same ROI leaders, with money
# columns formatted as dollars.
roi_leaders %>%
  mutate(
    roi = round(roi, 1),
    profit = scales::dollar(profit),
    budget = scales::dollar(budget),
    revenue = scales::dollar(revenue)
  ) %>%
  select(
    title, year, roi, profit, budget, revenue,
    rating = vote_average,
    votes = vote_count,
    `#co-genres` = n_cogenres,
    `co-genres` = cogenres_txt
  ) %>%
  DT::datatable(options = list(pageLength = 10, scrollX = TRUE))

Here we are checking which horror films are the most profitable, using ROI (Return on Investment):

$$\mathrm{ROI} = \frac{\text{revenue}}{\text{budget}}$$

So an ROI of 100 means the movie earned 100× its budget.

  • x-axis = ROI (log scale) → moving right means much higher profitability.
  • y-axis = movie titles (top = highest ROI in this selection).
  • Color = number of co-genres (how many genres appear alongside Horror).
  • Bubble size = absolute profit (revenue − budget), so big bubbles mean huge dollars earned, not just high ROI.

What we see

  • The most extreme ROI values are at the far right, and the top standouts are:

    • One Cut of the Dead
    • The Blair Witch Project
    • Bear Lake Bloodbath
    • Routes
    • Paranormal Activity

    These films achieved massive ROI, usually because they had very small budgets but earned surprisingly high revenue.

  • Paranormal Activity and The Blair Witch Project stand out as classics of the “micro-budget → massive payoff” model. Their bubbles are also relatively large, meaning they are not only high ROI, but also produced large total profit.

  • Many other well-known profitable films appear in the strong-ROI range (still far above 1), such as:

    • Halloween
    • Night of the Living Dead
    • Friday the 13th
    • The Texas Chain Saw Massacre
    • The Hills Have Eyes

    These are films that became major successes relative to cost, reinforcing the idea that horror often generates big returns without blockbuster budgets.

  • Looking at co-genres (colors):

    • Most top-ROI films are in the darker colors, meaning they have few co-genres (often 1–2).
    • A few titles show lighter colors (more co-genres), but they are less common among the very top ROI points.

    → This suggests that the biggest ROI hits are often simple, focused horror concepts (e.g., horror + thriller/mystery) rather than heavily blended multi-genre films.

Overall: the most profitable films (by ROI) are mostly low-budget horror movies that became breakout hits. Their co-genre counts are usually small, meaning extreme profitability often comes from straightforward horror premises rather than highly hybrid genre mixes. Also important: because ROI uses budget in the denominator, very small budgets can create extreme ROI values, which is why the plot warns that micro-budgets can produce “crazy” ROI numbers.


Theme C — Audience reception

C1) Do higher budgets buy higher ratings?

library(dplyr)
library(ggplot2)
library(scales)
library(viridis)

library(dplyr)
library(ggplot2)
library(scales)

# Rating analysis subset: films in the financial subset with a rating, a
# positive budget, and at least 50 votes.
rate_df <- movies_fin %>%
  filter(!is.na(vote_average), !is.na(budget)) %>%
  filter(vote_count >= 50, budget > 0)

# RQ-C1 figure: budget (log axis) vs rating with 2D density contours, faint
# individual points and a loess smoother with its uncertainty band.
p_budget_rating_adv2 <- ggplot(rate_df, aes(x = budget, y = vote_average))

p_budget_rating_adv2 <- p_budget_rating_adv2 +
  stat_density_2d(
    aes(fill = after_stat(level)),
    geom = "polygon",
    contour = TRUE,
    alpha = 0.35,
    color = NA
  ) +
  geom_point(alpha = 0.08, size = 0.7) +
  geom_smooth(method = "loess", se = TRUE, linewidth = 1.2, color = "#2F5597")

p_budget_rating_adv2 <- p_budget_rating_adv2 +
  scale_fill_viridis_c(option = "C", guide = "none") +
  scale_x_log10(labels = label_dollar(scale_cut = cut_short_scale())) +
  scale_y_continuous(limits = c(0, 10), breaks = 0:10)

p_budget_rating_adv2 <- p_budget_rating_adv2 +
  labs(
    title = "RQ-C1: Budget vs rating",
    subtitle = "Density shading reveals concentration; smoother includes uncertainty band.\nFiltered to movies with ≥ 50 votes.",
    x = "Budget (log scale)",
    y = "User rating (0–10)"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold", size = 18),
    plot.subtitle = element_text(size = 11, color = "grey30"),
    panel.grid.minor = element_blank()
  )

p_budget_rating_adv2

Here we are checking whether higher budgets buy higher ratings — meaning: do expensive horror movies get better user scores, or is budget not strongly related to quality (as viewers rate it)?

The x-axis is budget (log scale), so moving right means much larger budgets (each step is a big multiplier). The y-axis is the user rating (0–10). (Also, the plot is filtered to movies with ≥ 50 votes, so ratings are more reliable.)

  • Gray dots = individual movies.
  • Color density blob = where most movies are concentrated (brighter/warmer = more movies there).
  • Blue curve = smooth overall relationship between budget and rating.
  • Gray ribbon around the curve = uncertainty band (wider = less certainty, usually fewer movies in that range).

What we see

  • The strongest concentration of movies is in the mid-to-high budget range (roughly $1M to $100M) with ratings clustered around ~5.5 to ~6.5. → Most horror films in the dataset sit in a “typical zone”: moderate budgets and average-to-decent ratings.

  • The blue trend line increases slightly as budget grows, especially from very small budgets up to moderate budgets. → This suggests a weak positive relationship: bigger budgets are associated with slightly higher ratings.

  • However, the curve flattens for larger budgets (it doesn’t keep climbing sharply). → After a certain point, spending more money doesn’t guarantee much improvement in ratings.

  • The uncertainty band is widest on the far left (very low budgets). → That region likely has fewer films and/or noisier data, so we are less confident about the exact trend there.

  • Even at high budgets, there are still plenty of movies with only average ratings. → High budget does not prevent a film from being rated mediocre.

Overall: higher budgets are linked to slightly better ratings, but the effect is small. Most horror films—cheap or expensive—end up rated around 6-ish, and after moderate budgets the benefit seems to plateau. In other words, money helps a bit, but it doesn’t “buy” great ratings reliably.

C2) Profitability vs ratings (box + jitter)

library(dplyr)
library(ggplot2)
library(scales)

# Per-category sample size and median rating, plus the x-axis label that
# embeds the sample size. The label is built once here so that the
# violin/box/jitter layers and the median text layer share the exact same
# discrete x positions.
grp_stats <- rate_df %>%
  filter(!is.na(profit_flag), !is.na(vote_average)) %>%
  group_by(profit_flag) %>%
  summarise(
    n = n(),
    med = median(vote_average, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(profit_flag_lab = paste0(profit_flag, "\n(n=", n, ")"))

# Display order = row order of the summary. summarise() returns groups in
# sorted key order, i.e. factor-level order when profit_flag is a factor.
lvl <- grp_stats$profit_flag

# inner_join() (not left_join()) so that films with a missing profit_flag,
# which are excluded from grp_stats, do not show up as an extra "NA" category
# whose points are counted in no "(n=...)" label.
rate_df2 <- rate_df %>%
  inner_join(grp_stats, by = "profit_flag") %>%
  mutate(
    profit_flag_lab = factor(profit_flag_lab, levels = grp_stats$profit_flag_lab)
  )

p_profit_rating_adv <- ggplot(rate_df2, aes(x = profit_flag_lab, y = vote_average, fill = profit_flag)) +
  geom_violin(alpha = 0.18, width = 0.9, color = NA, trim = TRUE) +
  geom_boxplot(outlier.shape = NA, width = 0.45, linewidth = 0.8, alpha = 0.75) +
  # Horizontal jitter only: height = 0 keeps every point at its true rating
  # (the default also jitters vertically). The seed makes re-renders identical.
  geom_jitter(
    position = position_jitter(width = 0.16, height = 0, seed = 42),
    alpha = 0.12, size = 1.1, color = "grey20"
  ) +
  # Median value printed just above each group's median line.
  geom_text(
    data = grp_stats,
    aes(x = profit_flag_lab, y = med, label = round(med, 2)),
    inherit.aes = FALSE,
    vjust = -0.9,
    size = 3.4,
    fontface = "bold",
    color = "grey20"
  ) +
  scale_fill_manual(
    values = c(
      "Loss" = "#B07AA1",
      "Low profit" = "#4E79A7",
      "Hit" = "#59A14F",
      "Sleeper/Blockbuster" = "#E15759"
    ),
    guide = "none"
  ) +
  scale_y_continuous(limits = c(0, 10), breaks = 0:10, expand = expansion(mult = c(0.02, 0.08))) +
  labs(
    title = "RQ-C2: Do profitable movies get higher ratings?",
    subtitle = "Box + jitter, enhanced with violin density, medians, and sample sizes",
    x = NULL, y = "User rating (0–10)"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold", size = 18),
    plot.subtitle = element_text(size = 11, color = "grey30"),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank()
  )

p_profit_rating_adv

Here we are checking whether profitability is associated with higher ratings — meaning: do horror movies that make more money tend to be rated better by users?

The y-axis is the user rating (0–10), and movies are grouped by profit category on the x-axis.

  • Violin shape = rating density (where ratings are most common in that group).
  • Box plot = middle 50% of ratings (IQR) with the median line.
  • Gray dots = individual movies (jittered so we can see overlap).
  • Number on each box = median rating for that group.
  • (n= …) under each group = sample size (how many movies in that category).

What we see

  • Loss (n=257) has the lowest median rating, around 5.9. → Movies that lose money are, on average, rated slightly lower.

  • Low profit (n=138) has a median around 6.0. → A small improvement, but still very close to the loss group.

  • Hit (n=265) rises to about 6.1 median rating. → Profitable movies tend to score a bit better, but the difference is still modest.

  • Sleeper/Blockbuster (n=221) has the highest median, around 6.3. → The strongest money-makers are also rated a little higher on average.

  • Even though medians increase across groups, the violins and boxes overlap a lot. → That means there are many well-rated movies that weren’t profitable, and many profitable movies with only average ratings.

Overall: profitability is linked to slightly higher ratings, but the effect is not dramatic. The median rating climbs from ~5.9 (loss) to ~6.3 (sleeper/blockbuster), yet the heavy overlap shows that ratings alone don’t strongly predict financial success (and success doesn’t guarantee great ratings either).

C3) Runtime “sweet spot” (ratings vs runtime, plus a binned heatmap)

library(dplyr)
library(ggplot2)
library(scales)
library(viridis)

# Films with a rating backed by at least 50 votes and a runtime between
# 40 and 220 minutes (inclusive).
rt <- movies %>%
  filter(!is.na(runtime), !is.na(vote_average), vote_count >= 50,
         runtime >= 40, runtime <= 220)

# Mean rating per runtime bin. rt_bin is the lower edge of each bin_w-minute
# bin; bins with fewer than 20 films are dropped so tiny bins cannot dominate.
bin_w <- 5
rt_bins <- rt %>%
  mutate(rt_bin = floor(runtime / bin_w) * bin_w) %>%
  group_by(rt_bin) %>%
  summarise(
    mean_rating = mean(vote_average),
    n = n(),
    .groups = "drop"
  ) %>%
  filter(n >= 20)

# Sweet-spot estimate = bin with the highest mean rating.
# with_ties = FALSE guarantees a single row; with the default (ties kept),
# tied bins would yield several dashed lines and overlapping labels.
peak <- rt_bins %>% slice_max(mean_rating, n = 1, with_ties = FALSE)
peak_x <- peak$rt_bin + bin_w / 2  # bin centre, used for the line and label

p_rt_adv <- ggplot(rt, aes(x = runtime, y = vote_average)) +
  # density background (where points concentrate)
  stat_density_2d(aes(fill = after_stat(level)), geom = "polygon",
                  contour = TRUE, alpha = 0.30, color = NA) +
  scale_fill_viridis_c(option = "C", guide = "none") +

  # faint points for context
  geom_point(alpha = 0.06, size = 0.7) +

  # LOESS trend with its confidence band
  geom_smooth(method = "loess", se = TRUE, linewidth = 1.4,
              color = "#2F5597", fill = "#2F5597", alpha = 0.12) +

  # binned mean line, plotted at bin centres; point size encodes films per bin
  geom_line(data = rt_bins, aes(x = rt_bin + bin_w/2, y = mean_rating),
            linewidth = 1.2, color = "#F28E2B") +
  geom_point(data = rt_bins, aes(x = rt_bin + bin_w/2, y = mean_rating, size = n),
             color = "#F28E2B", alpha = 0.85) +
  scale_size_continuous(range = c(1.5, 5.5), guide = guide_legend(title = "Films/bin")) +

  # sweet spot annotation (label sits slightly above the peak mean)
  geom_vline(xintercept = peak_x, linetype = "dashed", alpha = 0.6) +
  annotate("label",
           x = peak_x,
           y = peak$mean_rating + 0.35,
           label = paste0("Sweet spot ~ ", peak_x, " min\n(mean=", round(peak$mean_rating, 2), ")"),
           label.size = 0.2, size = 3.6) +

  scale_y_continuous(limits = c(0, 10), breaks = 0:10) +
  labs(
    title = "RQ-C3: Is there a runtime sweet spot?",
    subtitle = "Blue: LOESS (± CI) • Orange: binned mean ratings (5-min bins) • Density shows concentration (≥ 50 votes)",
    x = "Runtime (minutes)", y = "User rating (0–10)"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold", size = 18),
    plot.subtitle = element_text(size = 11, color = "grey30"),
    panel.grid.minor = element_blank()
  )

p_rt_adv

Here we are checking whether horror movies have a runtime “sweet spot” — meaning: is there a movie length where ratings tend to be highest?

The x-axis is runtime (minutes) and the y-axis is user rating (0–10). (Also, the plot uses movies with ≥ 50 votes, so ratings are more stable.)

  • Gray dots = individual movies.
  • Purple density shading = where most movies cluster (brighter = more movies there).
  • Blue line = LOESS smooth trend (overall relationship) with a confidence band.
  • Orange points/line = binned mean ratings in 5-minute bins (averages by runtime groups).
  • Orange point size = number of films in that runtime bin (bigger = more films).
  • Vertical dashed line + label = estimated “sweet spot” runtime.

What we see

  • The strongest concentration of movies is around ~80–110 minutes, with ratings mostly around ~5.5–6.5. → This is the “typical horror runtime zone” in the dataset.

  • The orange binned means rise gradually as runtime increases from about ~75 minutes up to ~125 minutes. → Longer films (up to a point) tend to be rated slightly higher on average.

  • The plot highlights a sweet spot around ~127.5 minutes, with an average rating around ~6.79. → In this dataset, movies near ~2 hours (a bit over) get the best average ratings.

  • After about ~130 minutes, the blue smooth line flattens (and even slightly dips at the far right). → Past the sweet spot, making a film even longer does not clearly improve ratings.

  • The confidence band widens at very short and very long runtimes. → There are fewer movies in those extremes, so the estimate is less certain there.

Overall: horror movies show a mild runtime effect: ratings tend to improve from short runtimes toward about ~120–130 minutes, where the plot suggests a sweet spot (~127.5 min). After that, the benefit levels off, meaning very long runtimes don’t consistently earn higher ratings.

2D binned mean rating: runtime × budget

# Films with a rating backed by at least 50 votes, a runtime of 40-220
# minutes, and a budget of at least 100,000.
rt2 <- movies_fin %>%
  filter(
    !is.na(runtime) & !is.na(vote_average),
    vote_count >= 50,
    runtime >= 40 & runtime <= 220,
    budget >= 1e5
  )

# Grid cells: 10-minute runtime bins crossed with quarter-decade log10(budget)
# bins. Cells holding fewer than 15 films are removed before plotting.
bins <- rt2 %>%
  mutate(
    b_bin = cut(
      log10(budget),
      breaks = seq(5, 9, by = 0.25),
      include.lowest = TRUE
    ),
    rt_bin = cut(
      runtime,
      breaks = seq(40, 220, by = 10),
      include.lowest = TRUE
    )
  ) %>%
  group_by(rt_bin, b_bin) %>%
  summarise(
    mean_rating = mean(vote_average, na.rm = TRUE),
    n = n(),
    .groups = "drop"
  ) %>%
  filter(n >= 15)

# One tile per surviving cell; fill values outside [3, 8] are squished to
# the nearest limit instead of being dropped.
bins %>%
  ggplot(aes(x = b_bin, y = rt_bin, fill = mean_rating)) +
  geom_tile(linewidth = 0.2, color = "white") +
  scale_fill_viridis_c(option = "C", oob = squish, limits = c(3, 8)) +
  labs(
    x = "Budget bin (log10 USD)",
    y = "Runtime bin (minutes)",
    fill = "Mean rating",
    title = "Runtime × budget grid: where do ratings concentrate?",
    subtitle = "Tiles show mean rating (only cells with at least 15 films)"
  ) +
  theme_minimal(base_size = 11) +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

This heatmap combines runtime and budget together to see where ratings are stronger.

  • X-axis: budget bins (log10 USD) → moving right means higher budget
  • Y-axis: runtime bins (minutes)
  • Color: mean (average) rating in that cell
  • Only cells with at least 15 films are shown, so the averages are not from very small samples.

What we see

1) Ratings are not highest in the “typical” short runtime area

For 80–90 minutes (bottom row), the colors are more purple/red, meaning ratings are mostly around the mid level (about ~6). So the most common runtime is not automatically the best-rated.

2) The best-rated area is longer runtime + mid/high budget

The most yellow/orange tiles are in the 110–120 minute row (top) and middle-to-high budget bins (right side). This suggests that movies around 110–120 minutes, with bigger budgets, tend to have higher average ratings.

3) Medium runtimes (90–110) are stable but not “top”

For the 90–100 and 100–110 runtime bins, ratings are mostly in the middle colors. This means those runtimes give consistent but not the highest average ratings.

4) Budget alone still doesn’t guarantee high rating

Even in higher budget bins, some tiles are not very bright. So money helps sometimes, but ratings still depend on other things like story, direction, acting, and originality.

Overall, this heatmap shows that the highest mean ratings appear when runtime is around 110–120 minutes and budget is medium to high. Shorter movies (80–90 minutes) are usually rated closer to average. However, budget alone does not guarantee a high rating, because even expensive movies can still get ordinary ratings.

C4) Which co-genres are associated with higher ratings? (descriptive)

library(plotly)

# Rating summary per co-genre (every genre a film lists besides "Horror").
# All statistics come from one pass over the unnested film x co-genre rows;
# the previous version ran this pipeline twice and joined the results.
#   weighted_n  - total vote_count across the co-genre's films
#   mean_rating - unweighted mean of vote_average
#   se          - standard error of that mean
#   n_movies    - number of films carrying the co-genre
# Only co-genres with at least 80 films are kept, then the 12 with the
# highest mean rating.
rating_by_cogenre <- movies %>%
  filter(!is.na(vote_average), !is.na(vote_count), vote_count >= 50) %>%  # reliability filter
  mutate(cogenres = purrr::map(genre_list, ~ setdiff(.x, "Horror"))) %>%
  select(vote_average, vote_count, cogenres) %>%
  tidyr::unnest(cogenres) %>%
  filter(!is.na(cogenres), cogenres != "") %>%
  group_by(cogenres) %>%
  summarise(
    weighted_n = sum(vote_count),
    mean_rating = mean(vote_average),
    se = sd(vote_average) / sqrt(n()),
    n_movies = n(),
    .groups = "drop"
  ) %>%
  filter(n_movies >= 80) %>%
  arrange(desc(mean_rating)) %>%
  slice_head(n = 12)

# Vertical bar chart; bars are ordered by mean rating and the error bars
# span +/- 1.96 standard errors.
plot_ly(
  rating_by_cogenre,
  x = ~reorder(cogenres, mean_rating),
  y = ~mean_rating,
  type = "bar",
  text = ~paste0("Mean: ", round(mean_rating,2), "<br>n movies: ", n_movies),
  hoverinfo = "text",
  error_y = list(type = "data", array = ~1.96 * se, visible = TRUE)
) %>%
  layout(
    title = "RQ-C4: Which co-genres tend to have higher ratings?",
    xaxis = list(title = "", tickangle = -30),
    yaxis = list(title = "Mean vote_average (vote_count ≥ 50)"),
    margin = list(b = 120)
  )
library(dplyr)
library(tidyr)
library(plotly)
library(forcats)

# Top 12 co-genres by mean rating (co-genres with at least 80 films only),
# with a +/- 1.96 * SE interval and a pre-formatted hover label per row.
rating_by_cogenre <- movies %>%
  filter(!is.na(vote_average) & !is.na(vote_count) & vote_count >= 50) %>%
  mutate(
    cogenres = purrr::map(genre_list, function(g) setdiff(g, "Horror"))
  ) %>%
  select(vote_average, vote_count, cogenres) %>%
  tidyr::unnest(cogenres) %>%
  filter(!is.na(cogenres) & cogenres != "") %>%
  group_by(cogenres) %>%
  summarise(
    n_movies = n(),
    mean_rating = mean(vote_average),
    se = sd(vote_average) / sqrt(n_movies),
    ci = 1.96 * se,
    .groups = "drop"
  ) %>%
  filter(n_movies >= 80) %>%
  arrange(desc(mean_rating)) %>%
  head(12) %>%
  # Factor levels ordered by mean rating fix the bar order on the y-axis.
  mutate(
    cogenres = fct_reorder(cogenres, mean_rating),
    ci_low = mean_rating - ci,
    ci_high = mean_rating + ci
  ) %>%
  mutate(
    hover_txt = paste0(
      "<b>", cogenres, "</b><br>",
      "Mean rating: ", sprintf("%.2f", mean_rating), "<br>",
      "95% CI: [", sprintf("%.2f", ci_low), ", ", sprintf("%.2f", ci_high), "]<br>",
      "Movies: ", n_movies,
      "<extra></extra>"
    )
  )

# Right edge of the x-axis: the largest upper interval bound (padding is
# added in layout()).
x_max <- max(rating_by_cogenre[["ci_high"]], na.rm = TRUE)

# Horizontal bars coloured by mean rating, with the interval as error bars.
rating_by_cogenre %>%
  plot_ly(
    type = "bar",
    orientation = "h",
    y = ~cogenres,
    x = ~mean_rating,
    error_x = list(type = "data", array = ~ci, visible = TRUE),
    marker = list(
      color = ~mean_rating,
      colorscale = "Viridis",
      showscale = TRUE,
      line = list(color = "rgba(0,0,0,0.25)", width = 1)
    ),
    hovertext = ~hover_txt,
    hovertemplate = "%{hovertext}"
  ) %>%
  layout(
    title = list(
      text = "RQ-C4: Which co-genres tend to have higher ratings?",
      x = 0.02
    ),
    xaxis = list(
      title = "Mean vote_average (vote_count ≥ 50)",
      range = c(0, x_max + 0.25)
    ),
    yaxis = list(title = ""),
    margin = list(l = 140, r = 60, t = 60, b = 60)
  )

Here we are checking which co-genres tend to have higher ratings — meaning: when Horror is combined with another genre (Drama, Mystery, Thriller, etc.), which combinations are rated best by users?

The x-axis is the average user rating, and each bar is one co-genre.

  • Bar length = average rating for horror movies that include that co-genre.
  • Bar color = also reflects the rating (brighter = higher).
  • Blue error bars = uncertainty around the mean (small = more stable estimate, large = more variability / fewer films).

What we see

  • The highest-rated co-genre combination here is Horror + Drama, with an average rating a bit above 6.1. → Horror films with strong dramatic elements tend to be rated slightly better.

  • Next, Fantasy, Crime, and Mystery also sit near the top (around ~6.0–6.1). → These mixes often add story depth, plot structure, or investigation, which may boost audience satisfaction.

  • Comedy and Thriller are in the middle range (around ~5.8–5.9). → These are very common horror pairings, but they don’t stand out as the highest-rated on average.

  • The lowest-rated co-genres in this list are Action and Science Fiction (closer to ~5.6–5.7). → Horror mixed with action/sci-fi appears slightly less liked on average in this dataset.

  • The differences are not huge overall (roughly a 0.4–0.5 rating spread from bottom to top). → So co-genre matters a bit, but it’s not a dramatic effect.

Overall: the co-genres most associated with higher ratings are Drama, Fantasy, Crime, and Mystery, while Action and Science Fiction trend lower. Still, the gaps are fairly small, meaning co-genre influences ratings only moderately, not massively.


Interactive highlight (Plotly)

This interactive plot lets the viewer hover to see movie titles and key metrics.

# Films with at least 200 votes, each carrying an HTML tooltip (title, year,
# rating with vote count, budget, revenue, ROI) for the interactive scatter.
fin_small <- movies_fin %>%
  filter(!is.na(vote_average) & vote_count >= 200) %>%
  mutate(
    tooltip = paste0(
      "<b>", title,
      "</b><br>Year: ", year,
      "<br>Rating: ", round(vote_average, 1),
      " (", comma(vote_count),
      " votes)<br>Budget: $", comma(round(budget)),
      "<br>Revenue: $", comma(round(revenue)),
      "<br>ROI: ", round(roi, 2),
      "x"
    )
  )

library(viridisLite)

# Interactive budget-vs-revenue scatter on log-log axes; marker colour
# encodes the rating and hovering shows the prepared tooltip text.
fin_small %>%
  plotly::plot_ly(
    type = "scatter",
    mode = "markers",
    x = ~budget,
    y = ~revenue,
    color = ~vote_average,
    colors = viridisLite::viridis(256),
    marker = list(size = 7, opacity = 0.65),
    text = ~tooltip,
    hoverinfo = "text"
  ) %>%
  layout(
    title = "Interactive: Budget vs Revenue (hover for details)",
    xaxis = list(type = "log", title = "Budget (log)"),
    yaxis = list(type = "log", title = "Revenue (log)")
  )

This interactive Plotly chart is the same Budget vs Revenue idea, but now the viewer can hover on any point to see the movie’s details (title, year, rating + vote count, budget, revenue, ROI). This makes the analysis more “real” because we can move from patterns to specific examples.

What we see

1) Clear positive pattern (but not perfect)

Most points follow an upward direction: higher budget usually connects to higher revenue. So spending more money often gives more earning potential.

But the points are still spread out, so budget is not a guarantee.

2) Many movies lose money even with decent budgets

When a point has revenue lower than budget, the ROI will be below 1 (like the example shown: ROI 0.57x). This means some movies with millions of dollars budget still don’t make it back.

3) We can quickly find outliers (big wins and big failures)

With hover, we can identify:

  • Sleeper hits (low budget but high revenue, huge ROI)
  • Over-budget failures (high budget but low revenue). Both kinds of outlier are harder to identify with a static plot.

4) Rating color gives extra insight

Points are colored by vote_average. We can visually check if higher ratings always mean higher revenue. The plot suggests not always—there are high-rated movies that don’t earn much, and some average-rated movies that earn a lot.

Why this interactive plot is useful

This plot helps us explore the relationship between money and success, but also lets us see the exact movies behind extreme points. It supports our story that horror has both: blockbuster-style hits and low-budget surprise wins.

Overall, the interactive plot shows that budget and revenue are generally related, but there are many exceptions. By hovering, we can find movies that are surprise hits (high ROI) and movies that fail to earn back their budget. Also, a higher rating does not always mean higher revenue.


Conclusion

In this project we used the Horror Movies dataset to understand the “story behind horror” from three sides: how the genre changed over time, how money works in horror, and how audiences react.

First, horror clearly became much bigger over the years. The number of releases grows slowly in early decades, but after the 2000s (especially late 2010s) it increases very fast. At the same time, co-genre patterns show that horror keeps mixing with other styles. Thriller stays an important partner genre, while some older combinations (like horror + sci-fi) become less common. Runtime also stays quite stable: most horror movies are around ~90 minutes, but in recent decades we see more longer movies too.

Second, the financial analysis shows that horror is a genre where low-budget films can win big. Budget and revenue are related, but the scatterplots show many exceptions—some high-budget movies still fail, and some small movies become huge hits. The ROI table confirms this strongly: several famous horror movies earned many times their budget, meaning horror can be very profitable even without blockbuster spending. Over time, typical (median) budgets and revenues do not increase smoothly, and recent years can be affected by missing data or industry changes.

Third, audience reception results suggest that money does not buy high ratings. Bigger budgets only slightly improve ratings at low levels, but after medium budgets the ratings mostly stay around the same level. Profitable movies have slightly higher ratings on average, but there is a lot of overlap, so financial success and audience satisfaction are not the same thing. For runtime, we found a small “sweet spot”: longer movies (around 110–130 minutes) can have slightly higher average ratings, and the heatmap shows that the best mean ratings often appear in the longer runtime + medium/high budget area.

Overall, our results tell one clear story: horror is a flexible genre that grows fast, experiments with styles, and can generate surprising profits, but audience ratings depend more on creative choices than just budget.


7) Reproducibility

# Print the R version, platform, locale, and attached/loaded package versions
# used to render this report.
sessionInfo()
## R version 4.4.2 (2024-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS 26.2
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Europe/Warsaw
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] visNetwork_2.1.2  viridis_0.6.5     viridisLite_0.4.2 DT_0.33          
##  [5] ggiraph_0.9.2     ggrepel_0.9.6     patchwork_1.3.2   plotly_4.11.0    
##  [9] ggExtra_0.11.0    ggridges_0.5.7    scales_1.4.0      lubridate_1.9.3  
## [13] forcats_1.0.0     stringr_1.6.0     dplyr_1.1.4       purrr_1.2.0      
## [17] readr_2.1.5       tidyr_1.3.1       tibble_3.3.0      ggplot2_4.0.1    
## [21] tidyverse_2.0.0  
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6            xfun_0.54               bslib_0.9.0            
##  [4] htmlwidgets_1.6.4       lattice_0.22-6          tzdb_0.4.0             
##  [7] crosstalk_1.2.2         vctrs_0.6.5             tools_4.4.2            
## [10] generics_0.1.4          parallel_4.4.2          pkgconfig_2.0.3        
## [13] Matrix_1.7-1            data.table_1.17.8       RColorBrewer_1.1-3     
## [16] S7_0.2.0                lifecycle_1.0.4         compiler_4.4.2         
## [19] farver_2.1.2            httpuv_1.6.15           fontquiver_0.2.1       
## [22] fontLiberation_0.1.0    htmltools_0.5.8.1       sass_0.4.10            
## [25] yaml_2.3.10             lazyeval_0.2.2          crayon_1.5.3           
## [28] later_1.4.4             pillar_1.11.1           jquerylib_0.1.4        
## [31] MASS_7.3-61             cachem_1.1.0            nlme_3.1-166           
## [34] mime_0.13               fontBitstreamVera_0.1.1 tidyselect_1.2.1       
## [37] digest_0.6.38           stringi_1.8.7           labeling_0.4.3         
## [40] splines_4.4.2           fastmap_1.2.0           grid_4.4.2             
## [43] cli_3.6.5               magrittr_2.0.4          utf8_1.2.6             
## [46] withr_3.0.2             gdtools_0.4.4           promises_1.5.0         
## [49] bit64_4.5.2             timechange_0.3.0        rmarkdown_2.30         
## [52] httr_1.4.7              igraph_2.1.2            bit_4.5.0              
## [55] otel_0.2.0              gridExtra_2.3           hms_1.1.3              
## [58] shiny_1.9.1             evaluate_1.0.5          knitr_1.50             
## [61] miniUI_0.1.1.1          mgcv_1.9-1              rlang_1.1.6            
## [64] isoband_0.2.7           Rcpp_1.1.0              xtable_1.8-4           
## [67] glue_1.8.0              vroom_1.6.5             rstudioapi_0.17.1      
## [70] jsonlite_2.0.0          R6_2.6.1                systemfonts_1.3.1