0) Project goal + research questions
We use the Horror Movies dataset (Kaggle) to answer
three big themes with course-style visual storytelling:
Theme A — Evolution (time, sub-genres, runtime)
- RQ-A1: How did the number of horror releases change
over time (and where are the “boom” years)?
- RQ-A2: Did horror become more “hybrid” over time
(more co-genres per film)?
- RQ-A3: Which co-genres (Thriller, Mystery,
Comedy…) rose and fell over decades?
- RQ-A4: Did runtime distributions shift across
decades?
- RQ-A5: What does the co-genre ecosystem look like
(as a network)?
Theme B — Economics (budget, revenue, sleeper hits)
- RQ-B1: How strongly are budget and revenue
linked—and where do we find surprising hits (high ROI on low
budgets)?
- RQ-B2: Is the “typical” budget (and revenue)
trending upward over time?
- RQ-B3: Which films were the most profitable, and
what were their co-genres?
Theme C — Audience reception (ratings vs
budget/profitability/runtime)
- RQ-C1: Do higher budgets buy higher ratings?
- RQ-C2: Do more profitable movies get higher
ratings?
- RQ-C3: Is there a runtime “sweet spot” for audience
ratings?
- RQ-C4: Which co-genres are associated with higher
ratings? (descriptive)
Notes: Financial fields often contain zeros. In this report, we treat
0 budget/revenue/runtime as missing.
1) Libraries + data import
library(tidyverse)
library(lubridate)
library(scales)
library(stringr)
library(tidyr)
library(dplyr)
# Advanced viz
library(ggridges) # ridge plots
library(ggExtra) # marginal plots
library(plotly) # interactive graphics
library(patchwork) # arranging multiple plots
library(ggrepel) # nicer labels/annotations
library(ggiraph)
library(DT)
library(ggplot2)
library(viridis)
# Folder that holds the raw Kaggle export. The CSV is read through an explicit
# path instead of calling setwd(), so the working directory is not changed as
# a side effect of loading the data.
# NOTE(review): this is still a machine-specific absolute path; consider
# making it relative to the project root.
data_dir <- "/Users/majid/Documents/3-third semester/Advance R/RVisualization-Horror-Movie-main/data/raw"
movies_raw <- readr::read_csv(
  file.path(data_dir, "horror_movies.csv"),
  show_col_types = FALSE
)
# Structural overview: column names, types, and the first few values.
glimpse(movies_raw)
## Rows: 32,540
## Columns: 21
## $ ...1 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
## $ id <dbl> 760161, 760741, 882598, 756999, 772450, 1014226, 717…
## $ original_title <chr> "Orphan: First Kill", "Beast", "Smile", "The Black P…
## $ title <chr> "Orphan: First Kill", "Beast", "Smile", "The Black P…
## $ original_language <chr> "en", "en", "en", "en", "es", "es", "en", "en", "en"…
## $ overview <chr> "After escaping from an Estonian psychiatric facilit…
## $ tagline <chr> "There's always been something wrong with Esther.", …
## $ release_date <date> 2022-07-27, 2022-08-11, 2022-09-23, 2022-06-22, 202…
## $ poster_path <chr> "/pHkKbIRoCe7zIFvqan9LFSaQAde.jpg", "/xIGr7UHsKf0URW…
## $ popularity <dbl> 5088.584, 2172.338, 1863.628, 1071.398, 1020.995, 93…
## $ vote_count <dbl> 902, 584, 114, 2736, 83, 1, 125, 1684, 73, 1035, 637…
## $ vote_average <dbl> 6.9, 7.1, 6.8, 7.9, 7.0, 1.0, 5.8, 7.0, 6.5, 6.8, 7.…
## $ budget <dbl> 0, 0, 17000000, 18800000, 0, 0, 20000000, 68000000, …
## $ revenue <dbl> 9572765, 56000000, 45000000, 161000000, 0, 0, 289259…
## $ runtime <dbl> 99, 93, 115, 103, 0, 0, 88, 130, 90, 106, 98, 89, 97…
## $ status <chr> "Released", "Released", "Released", "Released", "Rel…
## $ adult <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
## $ backdrop_path <chr> "/5GA3vV1aWWHTSDO5eno8V5zDo8r.jpg", "/2k9tBql5GYH328…
## $ genre_names <chr> "Horror, Thriller", "Adventure, Drama, Horror", "Hor…
## $ collection <dbl> 760193, NA, NA, NA, NA, NA, 94899, NA, NA, 950289, N…
## $ collection_name <chr> "Orphan Collection", NA, NA, NA, NA, NA, "Jeepers Cr…
2) Cleaning + feature engineering
# Clean the raw table and derive the time and genre features used by every
# later section.
movies <- movies_raw %>%
  mutate(
    release_date = as.Date(release_date),
    year = year(release_date),
    decade = floor(year / 10) * 10,
    # A value of 0 in these fields is treated as missing in this report.
    runtime = na_if(runtime, 0),
    budget = na_if(budget, 0),
    revenue = na_if(revenue, 0),
    vote_average = na_if(vote_average, 0),
    vote_count = na_if(vote_count, 0),
    # Split genre_names like "Horror, Mystery, Thriller" into one character
    # vector per film. coalesce() replaces NA element-wise (`%||%` only guards
    # against a NULL object, so it never touched NA rows), and blank tokens
    # are dropped so they are not counted as genres.
    genre_list = purrr::map(
      str_split(coalesce(genre_names, ""), ",\\s*"),
      ~ .x[.x != ""]
    ),
    # Unknown genres give NA rather than a count of 1.
    n_genres = if_else(is.na(genre_names), NA_integer_, lengths(genre_list))
  ) %>%
  # The former `year <= max(year, na.rm = TRUE)` condition was always TRUE
  # and has been removed.
  filter(!is.na(year), year >= 1950)
# Financial subset used by the economics and ratings sections: only films
# where both budget and revenue are known and positive.
movies_fin <- movies %>%
  filter(!is.na(budget) & !is.na(revenue) & budget > 0 & revenue > 0) %>%
  mutate(
    profit = revenue - budget,
    roi = revenue / budget,
    log_budget = log10(budget),
    log_revenue = log10(revenue)
  ) %>%
  mutate(
    # Ordered outcome bands; the first matching rule wins.
    profit_flag = factor(
      case_when(
        profit < 0 ~ "Loss",
        roi < 2 ~ "Low profit",
        roi < 5 ~ "Hit",
        TRUE ~ "Sleeper/Blockbuster"
      ),
      levels = c("Loss", "Low profit", "Hit", "Sleeper/Blockbuster")
    )
  )

# Quick numeric overview of the main cleaned columns.
movies %>%
  select(year, runtime, vote_average, n_genres) %>%
  summary()
## year runtime vote_average n_genres
## Min. :1950 Min. : 1.00 Min. : 0.500 Min. : 1.000
## 1st Qu.:2000 1st Qu.: 26.00 1st Qu.: 4.100 1st Qu.: 1.000
## Median :2012 Median : 83.00 Median : 5.200 Median : 2.000
## Mean :2007 Mean : 67.69 Mean : 5.191 Mean : 1.991
## 3rd Qu.:2018 3rd Qu.: 92.00 3rd Qu.: 6.100 3rd Qu.: 3.000
## Max. :2022 Max. :683.00 Max. :10.000 Max. :16.000
## NA's :2668 NA's :11629
# Min / quartiles / median / mean / max of the financial fields in movies_fin.
summary(select(movies_fin, budget, revenue, profit, roi))
## budget revenue profit
## Min. : 1 Min. : 1 Min. :-194775779
## 1st Qu.: 1000000 1st Qu.: 797276 1st Qu.: -127461
## Median : 5375000 Median : 11009988 Median : 2729260
## Mean : 12642663 Mean : 38559968 Mean : 25917305
## 3rd Qu.: 15000000 3rd Qu.: 44975559 3rd Qu.: 29052546
## Max. :200000000 Max. :701842551 Max. : 666842551
## roi
## Min. : 0.000
## 1st Qu.: 0.667
## Median : 2.013
## Mean : 26.100
## 3rd Qu.: 4.830
## Max. :6666.667
Theme A — Evolution over time
A1) Releases per year (area chart + “boom years” annotations)
# Number of releases in each calendar year.
by_year <- count(movies, year, name = "n_movies")

# "Boom years": the single year with the most releases inside each decade
# (with_ties = FALSE keeps exactly one row per decade).
boom_years <- by_year %>%
  mutate(decade = 10 * floor(year / 10)) %>%
  group_by(decade) %>%
  slice_max(order_by = n_movies, n = 1, with_ties = FALSE) %>%
  ungroup()

# Area chart of yearly counts, a smoothed trend, and labelled boom years.
p_releases <- ggplot(data = by_year, mapping = aes(x = year, y = n_movies)) +
  geom_area(alpha = 0.7) +
  geom_smooth(se = FALSE, linewidth = 1) +
  geom_point(data = boom_years, size = 2) +
  ggrepel::geom_text_repel(
    data = boom_years,
    mapping = aes(label = paste0(year, ": ", comma(n_movies))),
    size = 3,
    max.overlaps = 20
  ) +
  scale_y_continuous(labels = comma) +
  labs(
    title = "RQ-A1: Horror releases over time",
    subtitle = "Area chart with a smooth trend and the highest-release year in each decade"
  ) +
  xlab(NULL) +
  ylab("Number of releases") +
  theme_minimal(base_size = 12)

p_releases

In this plot we count how many horror movies were released
each year. The grey area shows the number of
releases, and the blue smooth line shows the general
trend.
What we see:
- From the 1950s to the 1980s, the number of horror
releases is low and slowly increasing. For example, one
“boom year” in the 1950s is around 1958 (75 movies),
which is small compared to later decades.
- In the 1970s and 1980s, the numbers become
more stable and higher, but still not very big. We see
a boom year like 1972 (209) and later 1988
(413).
- After 2000, the growth becomes much
faster. The curve goes up strongly, and a boom year like
2009 (808) is already much bigger than all earlier
decades.
- The biggest change is after 2015. Releases increase
very quickly, reaching very high values in the late 2010s and
2020.
- The highest point is 2020 (~2,100 movies), and
2019 (~1,895) is also extremely high. This shows a
clear “boom” period.
Overall, horror movies became more and more common over
time, but the real explosion happens in the last 15–20
years. The genre looks like it is growing a
lot, especially in the late 2010s and around 2020. If you see a
small drop after 2020 in the data, it may be because recent years
are incomplete in the dataset or because the movie industry
changed during that time.
A2) Genre hybridization (how many genres per film over time?)
library(dplyr)
library(ggplot2)
library(scales)
# Per-year mean number of listed genres, with a normal-approximation 95%
# interval (mean +/- 1.96 * standard error).
hybrid <- movies %>%
  filter(!is.na(year) & !is.na(n_genres)) %>%
  group_by(year) %>%
  summarise(
    avg_n_genres = mean(n_genres),
    sd_n_genres = sd(n_genres),
    n = n(),
    .groups = "drop"
  ) %>%
  mutate(
    se = sd_n_genres / sqrt(n),
    ci = 1.96 * se,
    lo = avg_n_genres - ci,
    hi = avg_n_genres + ci
  )

p_hybrid <- ggplot(data = hybrid, mapping = aes(x = year, y = avg_n_genres)) +
  # 95% interval around each yearly mean
  geom_ribbon(
    mapping = aes(ymin = lo, ymax = hi),
    fill = "#4C78A8", alpha = 0.18
  ) +
  # observed yearly averages
  geom_line(color = "grey25", alpha = 0.55, linewidth = 0.9) +
  geom_point(color = "grey25", alpha = 0.55, size = 1.2) +
  # smoothed long-run trend
  geom_smooth(
    method = "loess", se = FALSE,
    color = "#2F5597", linewidth = 1.6
  ) +
  # yearly film count, rescaled into the range of the yearly means so that it
  # can share the y-axis
  geom_line(
    mapping = aes(y = rescale(n, to = range(avg_n_genres, na.rm = TRUE))),
    color = "#F28E2B", linewidth = 0.9, alpha = 0.6
  ) +
  annotate(
    "text",
    x = min(hybrid$year, na.rm = TRUE) + 2,
    y = max(hybrid$avg_n_genres, na.rm = TRUE),
    label = "Orange line = relative film count",
    hjust = 0, vjust = 1.2, size = 3.6, color = "#F28E2B"
  ) +
  scale_x_continuous(breaks = pretty_breaks(6)) +
  scale_y_continuous(
    breaks = pretty_breaks(),
    expand = expansion(mult = c(0.02, 0.06))
  ) +
  labs(
    title = "RQ-A2: Is horror becoming more hybrid?",
    subtitle = "Average number of listed genres per film • ribbon = 95% CI (yearly mean)",
    x = NULL,
    y = "Avg. number of genres per film"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold", size = 18),
    plot.subtitle = element_text(size = 12, color = "grey30"),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank()
  )

p_hybrid
Here we are checking genre hybridization, meaning:
does a horror movie usually come with only “Horror”, or is it mixed
with other genres like Thriller, Mystery, Comedy, Drama, etc.? The
y-axis is the average number of genres listed
per film each year.
- Dark grey line + dots = real yearly averages (can be
noisy year-to-year).
- Thick blue line = smooth trend (overall direction
across decades).
- Light blue ribbon = 95% confidence
interval around the yearly mean (wider = more uncertainty,
usually fewer films).
- Orange line = relative film count
(how many films you have each year, scaled so it can be shown on the
same plot).
What we see
In the 1950s, the average sits around
~2.2–2.3 genres per film, but the light blue ribbon
is very wide. → This means early years are more uncertain
(likely fewer movies in the dataset).
Through the 1960s–1970s, the average stays
pretty stable (still around ~2.1–2.2).
→ Horror is commonly tagged as Horror + one more
genre.
In the late 1970s to mid-1980s, the plot shows
the highest hybridization (peaks close to
~2.4–2.5). → This is the period where horror films are
most often labeled with multiple co-genres (e.g.,
Horror–Sci-Fi, Horror–Mystery, Horror–Thriller).
After the late 1980s / early 1990s, the
blue trend starts declining.
From about 2000 onward, the average drops more
clearly, reaching around ~1.9–2.0 by the late
2010s / early 2020s. → In the dataset, recent horror movies are
tagged with fewer genres, so they look less
hybrid overall.
Meanwhile, the orange line rises strongly after
~2005, meaning the dataset contains many more films in
recent years. → So the decrease in genres per film
happens even while film counts increase, which supports
the idea that this is not just random noise.
Overall: based on this dataset, horror movies look
most hybrid in the 1980s, and less hybrid in
the last 20–25 years (fewer genres listed per movie). As a
caveat, part of this trend could reflect changes in
tagging practices (how platforms label genres over time), not
only a true change in filmmaking.
A3) Co-genre trends (stacked area shares)
We treat sub-genre as the co-genres that
appear with Horror (e.g., Thriller, Mystery).
# One row per (film, genre) tag, with blank or missing tags removed.
genres_long <- movies %>%
  select(id, year, genre_list) %>%
  unnest_longer(genre_list, values_to = "genre") %>%
  mutate(genre = str_trim(genre)) %>%
  filter(genre != "", !is.na(genre))

# The eight most frequent co-genres (Horror itself is not a co-genre).
top_cogenres <- genres_long %>%
  filter(genre != "Horror") %>%
  count(genre, sort = TRUE) %>%
  slice_head(n = 8) %>%
  pull(genre)

# Yearly composition of co-genres (top 8 + "Other").
cogenre_year <- genres_long %>%
  # Remove the Horror tag BEFORE lumping. Horror is not in top_cogenres, so
  # lumping first would relabel it "Other" and a later filter on
  # genre2 != "Horror" could never match; every Horror tag would then be
  # counted inside the "Other" band.
  filter(genre != "Horror") %>%
  mutate(genre2 = if_else(genre %in% top_cogenres, genre, "Other")) %>%
  count(year, genre2) %>%
  # Give every year a row for every category (0 when absent) so the stacked
  # areas are defined at each x position.
  tidyr::complete(year, genre2, fill = list(n = 0L)) %>%
  group_by(year) %>%
  mutate(share = n / sum(n)) %>%
  ungroup()
# NOTE(review): after re-knitting, re-check the written interpretation of the
# "Other" band against the new figure.

# Stacked area of within-year shares; position = "fill" normalises each year
# to 100%.
p_cogenre <- ggplot(cogenre_year, aes(x = year, y = share, fill = genre2)) +
  geom_area(position = "fill", alpha = 0.9) +
  scale_y_continuous(labels = percent) +
  labs(
    title = "RQ-A3: Which co-genres rose and fell?",
    subtitle = "Share of co-genres among horror movies (top 8 + Other)",
    x = NULL, y = "Share (within year)", fill = NULL
  ) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "right")

p_cogenre

This stacked area chart shows which co-genres are most common
together with Horror, and how their shares change over
time. Important: the y-axis is percent within each
year, so every year always adds up to 100%.
This means we are comparing composition, not the total
number of movies.
How to read it
- Each color = one co-genre (Thriller, Mystery, Sci-Fi, etc.)
- Bigger band = that co-genre appears more often with
Horror in that period
- “Other” is a big group of many small genres combined
Main results
1) “Other” is always the biggest part
The blue “Other” area stays very large in all years. This tells us
horror movies mix with many different small genres, not
only the famous ones. So the genre is diverse and not
limited to just a few combinations.
2) Thriller becomes stronger in recent decades
The pink “Thriller” part is visible in all years, but it becomes
more stable and a bit larger in the later years
(especially after 2000). So modern horror is often horror +
thriller, meaning more suspense, tension, and psychological
fear.
3) Science Fiction is strong in early decades, then drops
The purple “Science Fiction” band looks bigger in earlier decades
(around 1950s–1960s), and later it becomes smaller. So old horror had
more horror + sci-fi (like monsters, aliens,
experiments), but this combination is less dominant
now.
4) Mystery and Drama are present but not dominant
The light blue “Mystery” and green “Drama” appear across time, but
they usually stay medium/small. This suggests that
horror sometimes uses mystery storytelling or emotional drama, but they
are not the main co-genres compared to Thriller (and the big
“Other”).
5) Comedy/Action/Crime/Fantasy are smaller but show “flavor
changes”
These genres stay relatively small, but they move up and down a
little. This means some periods prefer special styles like
horror-comedy, action horror, or
fantasy-like horror, but they are not the core trend.
Overall, horror movies are mixed with many different genres (big
‘Other’ area). Over time, Thriller becomes one of the most
common partners of Horror, while Sci-Fi mixed horror
was more common in older decades and becomes smaller later.
This shows horror changes style depending on the period.
A4) Runtime evolution (ridge plot by decade)
# Films from the 1960s onward whose runtime is known and lies within
# 40-220 minutes (both bounds inclusive); decade becomes a factor so that each
# decade gets its own ridge.
runtime_decade <- movies %>%
  filter(
    !is.na(runtime) & dplyr::between(runtime, 40, 220) & decade >= 1960
  ) %>%
  mutate(decade = factor(decade))

# One density ridge per decade, filled by the runtime value along the x-axis.
p_runtime <- ggplot(
  data = runtime_decade,
  mapping = aes(x = runtime, y = decade, fill = after_stat(x))
) +
  ggridges::geom_density_ridges_gradient(
    scale = 2.2,
    rel_min_height = 0.01,
    size = 0.3
  ) +
  scale_fill_viridis_c(option = "C", name = "Runtime") +
  labs(
    title = "RQ-A4: Runtime distributions across decades",
    subtitle = "Ridge plot (trimmed to 40–220 minutes to reduce outliers)"
  ) +
  xlab("Runtime (minutes)") +
  ylab("Decade") +
  theme_minimal(base_size = 12)

p_runtime

This ridge plot shows how horror movie runtimes are
distributed in each decade (each “ridge” is one decade). The
x-axis is runtime in minutes. The shape tells us where
most movies are concentrated. (We trimmed the data to 40–220
minutes to remove extreme outliers.)
What we see
Most horror movies are around 85–100 minutes in
almost every decade. The highest part of each ridge is usually near
~90 minutes, so this looks like the “classic” horror
length.
In older decades (1960s–1970s), the ridge is a
bit more spread out and sometimes has small bumps. This means runtime
was less consistent and there were more movies that are
shorter or longer compared to the main peak.
From the 1980s to 2000s, the distributions
become more concentrated around 90 minutes. So horror
becomes more standardized, like producers follow a common
format.
In the 2010s and 2020s, the main peak is still
around 90 minutes, but we also see a longer right tail
(more movies above 110–130 minutes). That suggests recent decades
include more long-form horror (often psychological /
story-driven) besides the typical short runtime.
Simply put, across decades, horror movies usually stay close to
about 90 minutes, so this is like the normal runtime of
the genre. But in newer decades, we see more movies that are
longer than before, meaning modern horror sometimes
wants more time for story, tension, and slow build-up. Still, the
majority remains around the same classic length. Because the dataset
includes many movies in recent years, the ridge shapes for 2010s/2020s
may be smoother and more detailed (more data points), while early
decades can look more “bumpy” because there are fewer films
recorded.
A5) What does the co-genre ecosystem look like (as a network)?
library(visNetwork)
# All unordered pairs of co-genres for one film: Horror is removed, the rest
# are de-duplicated and sorted so each pair is always written the same way.
# Returns a zero-row tibble when fewer than two co-genres remain.
cogenre_pairs <- function(genres) {
  partners <- sort(unique(setdiff(genres, "Horror")))
  if (length(partners) >= 2) {
    combos <- combn(partners, 2)
    tibble::tibble(from = combos[1, ], to = combos[2, ])
  } else {
    tibble::tibble(from = character(), to = character())
  }
}

# One row per (film, co-genre pair).
pairs_df <- movies %>%
  transmute(id, pairs = purrr::map(genre_list, cogenre_pairs)) %>%
  tidyr::unnest(pairs)

# Edge list: number of films per pair, keeping only pairs seen at least 120
# times (tune this threshold to avoid clutter).
edges <- pairs_df %>%
  count(from, to, name = "weight", sort = TRUE) %>%
  filter(weight >= 120)
# Per-co-genre statistics for the network nodes.
#   freq        - number of films carrying the co-genre. It is shown as
#                 "Movies" in the tooltip and used as node size, so it is
#                 counted over ALL films, not only those that have ratings.
#   mean_rating - mean vote_average over the films that have one
#                 (na.rm = TRUE skips unrated films).
node_stats <- movies %>%
  mutate(cogenres = purrr::map(genre_list, ~ setdiff(.x, "Horror"))) %>%
  select(vote_average, cogenres) %>%
  tidyr::unnest(cogenres) %>%
  filter(!is.na(cogenres), cogenres != "") %>%
  group_by(cogenres) %>%
  summarise(
    freq = n(),
    mean_rating = mean(vote_average, na.rm = TRUE),
    .groups = "drop"
  )
# Node table: every genre that appears at either end of a retained edge,
# joined to its statistics. `value` drives node size and `title` is the HTML
# hover text.
nodes <- tibble::tibble(id = union(edges$from, edges$to)) %>%
  left_join(node_stats, by = c("id" = "cogenres")) %>%
  mutate(
    label = id,
    value = freq,
    title = paste0(
      "<b>", id, "</b><br>Movies: ", freq,
      "<br>Mean rating: ", round(mean_rating, 2)
    )
  )

# Interactive network with a Fruchterman-Reingold layout computed by igraph.
visNetwork(
  nodes = as.data.frame(nodes),
  edges = as.data.frame(edges)
) %>%
  visIgraphLayout(layout = "layout_with_fr") %>%
  visOptions(highlightNearest = TRUE, nodesIdSelection = TRUE) %>%
  visEdges(smooth = FALSE) %>%
  visPhysics(stabilization = TRUE) %>%
  visInteraction(hover = TRUE)
Here we are checking the co-genre ecosystem as a
network — meaning: which genres tend to appear together
with Horror, and how are those co-genres connected to each
other?
In this network, each node is a genre that co-occurs
with Horror, and each edge (line) means two genres
appear together in the same movie (alongside
Horror).
- Node size = how common that co-genre is (bigger =
appears with Horror more often).
- Lines (edges) = co-occurrence links (genres that
frequently show up together in the same films).
- Clusters / proximity = genres that often mix
together form tight groups.
What we see
Thriller is the main hub (largest node, most
connections). → This means Horror is most often combined with
Thriller, and Thriller also connects strongly to many
other genres (it acts like a “bridge” genre).
Around Thriller, we see other highly connected
genres like Mystery, Drama, Comedy, and Science
Fiction. → These genres appear often with Horror and also
combine with each other, creating a dense “core” of hybrid
horror.
The network has a clear core–periphery
structure:
- Core (dense center): Thriller, Mystery, Drama,
Comedy, Sci-Fi, Action, Fantasy → lots of links, many hybrid
combinations.
- Periphery (outer nodes): Romance, Animation, TV
Movie, Adventure, Crime → fewer links and/or less frequent
co-occurrence.
Comedy and Drama connect into the core, showing
that horror often branches into:
- Horror–Comedy (camp, parody, dark comedy)
- Horror–Drama (psychological / character-driven
horror)
Science Fiction and Action also sit close to the
core and connect broadly. → This suggests a strong
“horror–sci-fi/action” lane (monsters, apocalyptic
stories, creature features).
Some genres look more specialized:
- Animation is more isolated (few links). → Horror
animation exists, but it’s rarer and less mixed.
- Romance also sits on the edge. → Horror–romance
combos happen, but they’re not a dominant pattern.
Overall: the co-genre ecosystem shows a
dense central cluster dominated by Thriller
(the strongest partner of Horror), with
Mystery/Drama/Comedy/Sci-Fi forming the main supporting
mix. Outside this core are more niche pairings like
Animation and Romance, which occur
less often and connect less broadly.
Theme B — Economics
B1) Budget vs revenue (scatter + marginal distributions) + “sleeper
hits”
# Films with budget and revenue of at least $100k (reduces extreme noise from
# tiny reported numbers). A "sleeper hit" is a film in the top 5% of ROI AND
# the bottom 40% of budget, both measured within this filtered set.
fin <- movies_fin %>%
  filter(budget >= 1e5, revenue >= 1e5) %>%
  mutate(
    sleeper = roi >= quantile(roi, 0.95, na.rm = TRUE) &
      budget <= quantile(budget, 0.40, na.rm = TRUE)
  )

# Log-log scatter of budget vs revenue; sleeper hits are drawn fully opaque
# and all other films are faded.
p_scatter <- ggplot(fin, aes(x = budget, y = revenue)) +
  geom_point(aes(alpha = sleeper), size = 1.8) +
  # Explicit alpha values for the logical flag. They match what the default
  # discrete alpha scale assigns to two levels (0.1 and 1), but avoid its
  # "alpha for a discrete variable" warning.
  scale_alpha_manual(values = c("FALSE" = 0.1, "TRUE" = 1)) +
  scale_x_log10(labels = label_number(scale_cut = cut_short_scale())) +
  scale_y_log10(labels = label_number(scale_cut = cut_short_scale())) +
  labs(
    title = "RQ-B1: Budget vs revenue (log scale) + sleeper hits",
    subtitle = "Sleeper hit = top 5% ROI AND bottom 40% budget (heuristic)",
    x = "Budget (log10)", y = "Revenue (log10)", alpha = "Sleeper hit"
  ) +
  theme_minimal(base_size = 12)

# Add marginal histograms of budget (top) and revenue (right).
ggExtra::ggMarginal(p_scatter, type = "histogram", bins = 30)

This plot compares budget vs revenue for horror
movies. Both axes are in log scale, so we can see small
and big movies together (because budgets and revenues are very
spread).
- Each dot = one movie
- X-axis = budget (log10) → more to the right =
higher budget
- Y-axis = revenue (log10) → higher = more
revenue
- The top and right histograms show the distributions
of budget and revenue.
- Black points = “sleeper hits” (our rule:
top 5% ROI and bottom 40%
budget).
What we see
2) Most horror movies are “mid budget” in the dataset
From the top histogram, the highest bars are around the middle
budgets (not extremely small and not extremely huge). So most horror
films here are not blockbuster-level, but also not extremely tiny.
3) Revenue distribution has a long tail
On the right histogram, we see many movies earn “normal” amounts, but
a smaller number reach very high revenue (tens/hundreds of millions). So
the genre has some very big winners, but they are not
the majority.
4) Sleeper hits appear on the “left but high” area
The black dots are mostly left side (lower budgets)
but high on y-axis (high revenue). This matches the
idea of sleeper hits: they didn’t cost much, but they
earned a lot, so their ROI becomes huge.
5) Key message
Horror is a genre where low-budget movies can still become
very profitable. Compared to some other genres, horror can
succeed with smaller budgets because it relies more on
atmosphere, story, and suspense rather than expensive
special effects. (This is a logical interpretation based on the
pattern.)
Overall, this chart shows budget and revenue are connected, but not
strongly. Many movies with medium or even low budget can still earn very
high revenue. The black points (sleeper hits) show that horror can give
surprising success even with small budgets.
Identify the “most surprising” wins (ROI leaders)
# The 15 films with the highest ROI in the filtered financial set, reduced to
# the columns needed for the table.
top_roi <- fin %>%
  arrange(desc(roi)) %>%
  slice_head(n = 15) %>%
  select(
    title, year,
    budget, revenue, profit, roi,
    vote_average, vote_count
  )

top_roi
## # A tibble: 15 × 8
## title year budget revenue profit roi vote_average vote_count
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Paranormal Activity 2007 2.15e5 1.94e8 1.94e8 903. 6 4309
## 2 Lady Frankenstein 1971 2 e5 1.40e8 1.39e8 698. 5 39
## 3 The Gallows 2015 1 e5 4.27e7 4.26e7 427. 5 841
## 4 Open Water 2004 1.3 e5 5.47e7 5.45e7 421. 5.5 905
## 5 Night of the Livin… 1968 1.14e5 3 e7 2.99e7 263. 7.6 1935
## 6 Halloween 1978 3.25e5 7.03e7 6.99e7 216. 7.6 4421
## 7 The Legend of Bogg… 1972 1.6 e5 2.20e7 2.18e7 138. 5 48
## 8 The Hills Have Eyes 1977 2.3 e5 2.5 e7 2.48e7 109. 6.2 693
## 9 Friday the 13th 1980 5.50e5 5.98e7 5.92e7 109. 6.4 2289
## 10 The Devil Inside 2012 1 e6 1.02e8 1.01e8 102. 4.7 678
## 11 Taxiwala 2018 1 e6 1 e8 9.9 e7 100 6.1 21
## 12 The Rocky Horror P… 1975 1.20e6 1.13e8 1.12e8 94.1 7.5 2326
## 13 The Quiet Ones 2014 2 e5 1.78e7 1.76e7 89.2 5.1 584
## 14 Saw 2004 1.20e6 1.04e8 1.03e8 86.6 7.4 7650
## 15 Dawn of the Dead 1978 6.5 e5 5.50e7 5.44e7 84.6 7.5 1674
This table is showing the “most surprising wins” in
horror, meaning movies with the highest ROI (Return on
Investment).
What ROI means
- ROI ≈ revenue / budget
- So ROI = 100x means the movie made about
100 times its budget (very rare in most genres).
What we learn from this table
1) Horror can produce crazy high ROI with small
budgets
Look at the top rows:
Paranormal Activity (2007): budget around
$215k, revenue around $194M, ROI about
903x → This is a perfect example of a “small movie →
huge success”.
The Gallows (2015): budget
$100k, revenue $42.7M, ROI
~427x
Open Water (2004): budget
$130k, revenue $54.7M, ROI
~421x
These are “surprising” because they didn’t need big money to become
profitable.
2) Classic horror franchises also show strong ROI
Some famous titles appear too:
- Halloween (1978): ROI ~216x
- Friday the 13th (1980): ROI
~109x
- Saw (2004): ROI ~86.6x
So even well-known horror classics were originally made
cheaply compared to their revenue.
3) Ratings are not always super high, but profits are huge
Many of these movies have ratings around 5–7, not
necessarily perfect. This suggests that financial success ≠ only
high rating. Horror can sell well because of:
- strong concept / marketing
- cinema experience
- word-of-mouth
- “must-watch” hype
4) Vote_count matters (data reliability)
Some movies have very small vote counts (like 21,
39, 48). Those results can still be real, but the audience-rating info
is less reliable than movies with thousands of votes
(like Saw with 7,650 votes).
This table proves horror is a genre where low-budget movies can
become extremely profitable. The top ROI movies earned tens or hundreds
of millions with budgets sometimes below $1M. Also, high ROI does not
always mean high rating, so audience enjoyment and box-office success
are not exactly the same thing.
B2) Are budgets and revenues trending upward?
library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
# Yearly medians and quartiles of budget and revenue, plus the number of
# non-missing observations behind each.
trend2 <- movies_fin %>%
  filter(!is.na(year)) %>%
  group_by(year) %>%
  summarise(
    n_budget = sum(!is.na(budget)),
    n_revenue = sum(!is.na(revenue)),
    med_budget = median(budget, na.rm = TRUE),
    q25_budget = quantile(budget, 0.25, na.rm = TRUE),
    q75_budget = quantile(budget, 0.75, na.rm = TRUE),
    med_revenue = median(revenue, na.rm = TRUE),
    q25_revenue = quantile(revenue, 0.25, na.rm = TRUE),
    q75_revenue = quantile(revenue, 0.75, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  # only years with at least 15 observations for BOTH measures
  filter(n_budget >= 15 & n_revenue >= 15) %>%
  mutate(year = as.integer(year))

# Long format (one row per year x metric) so both series share one mapping.
trend_long <- bind_rows(
  trend2 %>%
    select(
      year,
      med = med_budget, lo = q25_budget, hi = q75_budget, n = n_budget
    ) %>%
    mutate(metric = "Budget", .after = year),
  trend2 %>%
    select(
      year,
      med = med_revenue, lo = q25_revenue, hi = q75_revenue, n = n_revenue
    ) %>%
    mutate(metric = "Revenue", .after = year)
)

# Median lines with IQR ribbons on a log10 dollar axis; point size shows how
# many films back each yearly value.
p_budget_trend_adv <- ggplot(
  data = trend_long,
  mapping = aes(x = year, y = med, color = metric, fill = metric)
) +
  geom_ribbon(
    mapping = aes(ymin = lo, ymax = hi),
    alpha = 0.18, colour = NA
  ) +
  geom_line(linewidth = 1.3) +
  geom_point(mapping = aes(size = n), alpha = 0.6) +
  scale_y_log10(
    breaks = c(1e5, 3e5, 1e6, 3e6, 1e7, 3e7, 1e8, 3e8, 1e9),
    labels = label_dollar(scale_cut = cut_short_scale())
  ) +
  scale_size_continuous(
    range = c(1.2, 5.5),
    guide = guide_legend(title = "Films/year", order = 3)
  ) +
  scale_color_manual(values = c("Budget" = "#E15759", "Revenue" = "#4E79A7")) +
  scale_fill_manual(values = c("Budget" = "#E15759", "Revenue" = "#4E79A7")) +
  labs(
    title = "RQ-B2: Typical budgets and revenues over time",
    subtitle = "Lines = median • ribbons = IQR (25–75%) • point size = number of films with non-missing values\nLog scale improves readability across decades",
    x = NULL,
    y = "USD (log scale)",
    color = NULL,
    fill = NULL
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold", size = 18),
    plot.subtitle = element_text(size = 11, color = "grey30"),
    legend.position = "top",
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank()
  )

p_budget_trend_adv
Here we are checking how typical horror movie budgets and
revenues change over time — basically: are horror movies
getting more expensive to make, and are they earning more
money?
The y-axis is USD on a log scale, which means equal
vertical gaps represent multiplicative changes (e.g.,
$1M → $10M is the same “distance” as $10M → $100M).
- Red line = median budget each year
(typical budget).
- Blue line = median revenue each
year (typical revenue).
- Colored ribbons = IQR (25%–75%) →
the “middle half” of movies (how spread out budgets/revenues are).
- Point size = number of films that
year with non-missing budget/revenue (bigger = more data that
year).
What we see
1980s → early/mid 1990s: both typical
budgets (red) and typical revenues (blue)
increase gradually. → Horror films become more
expensive and also earn more, with revenue
generally staying above budget.
Late 1990s → early 2000s: both lines show
noticeable year-to-year jumps (especially revenue). →
This suggests some years have bigger breakout hits
pulling the median up, or the data becomes more variable.
2000s → 2010s: the median
budget stays fairly flat (around the same
level), while median revenue fluctuates but is often
higher than budget. → Typical horror movies don’t
become massively more expensive, but they still often gross more
than they cost.
The ribbons (IQR bands), especially for
revenue (blue), are often quite wide. → Revenues are
much more spread out than budgets: some films make
modest money, while others become major hits.
Around 2020–2021: there is a huge
drop in both budget and revenue, followed by a bounce back. →
This is very likely a data disruption / missingness
effect (and in real life, also consistent with the pandemic era
affecting releases and box office). The extremely wide ribbon here also
signals high uncertainty and possibly fewer
reliable observations.
Overall: typical budgets and revenues generally
rise from the 1980s into the 1990s, then
stabilize with fluctuations from the 2000s onward.
Revenues usually stay above budgets, but revenue is
far more variable, meaning horror has a “hit-or-miss”
earnings pattern. The sharp dip around 2020–2021 should
be interpreted cautiously because it may reflect unusual
conditions and/or incomplete reporting rather than a true
long-term decline.
B3) Which films were the most profitable, and what were their
co-genres?
# The 25 films with the highest ROI, enriched with co-genre information and an
# HTML tooltip for the interactive chart.
# The top 25 are selected first so the per-film strings are built only for
# the rows that are kept. `profit` already exists in movies_fin and is reused.
roi_leaders <- movies_fin %>%
  arrange(desc(roi)) %>%
  slice_head(n = 25) %>%
  mutate(
    # Genres listed next to Horror; NA or blank entries are dropped so they
    # are not counted as a co-genre.
    cogenres = purrr::map(
      genre_list,
      function(g) setdiff(g[!is.na(g) & g != ""], "Horror")
    ),
    n_cogenres = lengths(cogenres),
    cogenres_txt = purrr::map_chr(
      cogenres,
      function(g) if (length(g) == 0) "None" else paste(g, collapse = ", ")
    ),
    # Wrapped title used as the axis label.
    title_wrapped = stringr::str_wrap(title, width = 28),
    tooltip = paste0(
      "<b>", title, "</b> (", year, ")<br>",
      "<b>ROI:</b> ", round(roi, 1), "x<br>",
      "<b>Profit:</b> ", scales::dollar(profit), "<br>",
      "<b>Budget:</b> ", scales::dollar(budget), " | <b>Revenue:</b> ", scales::dollar(revenue), "<br>",
      "<b>#Co-genres:</b> ", n_cogenres, "<br>",
      "<b>Co-genres:</b> ", cogenres_txt, "<br>",
      "<b>Rating:</b> ", vote_average, " (n=", vote_count, ")"
    )
  )
# Lollipop chart of the ROI leaders: a faint stem from ROI = 1 to each film's
# ROI, with an interactive point at the tip (colour = number of co-genres,
# size = profit). Axes are flipped so the titles read horizontally.
p_roi <- roi_leaders %>%
  ggplot(aes(x = reorder(title_wrapped, roi), y = roi)) +
  geom_segment(
    mapping = aes(xend = title_wrapped, y = 1, yend = roi),
    alpha = 0.25,
    linewidth = 0.7
  ) +
  geom_point_interactive(
    mapping = aes(
      tooltip = tooltip,
      data_id = id,
      size = profit,
      color = n_cogenres
    ),
    alpha = 0.95
  ) +
  # Scales: ROI on a log axis, viridis for the co-genre count, dollars for size.
  scale_size_continuous(labels = scales::dollar, range = c(2.5, 8)) +
  scale_color_viridis_c(end = 0.95, option = "D") +
  scale_y_log10(
    labels = scales::label_number(),
    breaks = c(1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000)
  ) +
  coord_flip(clip = "off") +
  labs(
    x = NULL,
    y = "ROI = revenue / budget (log scale)",
    title = "RQ-B3: Most profitable horror films",
    subtitle = "ROI shown on a log scale • Color = number of co-genres • Size = profit",
    caption = "Note: very small budgets can produce extreme ROI values.",
    size = "Profit",
    color = "# Co-genres"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    legend.position = "right",
    plot.margin = margin(t = 10, r = 10, b = 10, l = 10),
    plot.subtitle = element_text(size = 12),
    plot.title = element_text(face = "bold", size = 18),
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank()
  )
# Turn the ROI chart into an interactive ggiraph widget and configure its
# hover outline, tooltip box and zoom limit.
girafe_options(
  x = girafe(ggobj = p_roi, width_svg = 11, height_svg = 7),
  opts_hover(css = "stroke:#111;stroke-width:2;"),
  opts_tooltip(css = "background-color:white;padding:10px;border:1px solid #999;border-radius:8px;"),
  opts_zoom(max = 4)
)
# interactive table (cleaner + searchable)
# Money columns are passed to DT as numbers and formatted by the widget, so
# clicking a column header sorts by amount instead of alphabetically by the
# pre-formatted "$..." text.
roi_leaders %>%
  transmute(
    title, year,
    roi = round(roi, 1),
    profit, budget, revenue,
    rating = vote_average,
    votes = vote_count,
    `#co-genres` = n_cogenres,
    `co-genres` = cogenres_txt
  ) %>%
  DT::datatable(options = list(pageLength = 10, scrollX = TRUE)) %>%
  # Whole-dollar display with a "$" prefix and thousands separators.
  DT::formatCurrency(c("profit", "budget", "revenue"), digits = 0)
Here we are checking which horror films are the most
profitable, using ROI (Return on
Investment):
\[ \text{ROI} = \frac{\text{revenue}}{\text{budget}} \]
So an ROI of 100 means the movie earned 100×
its budget.
- x-axis = ROI (log scale) → moving
right means much higher profitability.
- y-axis = movie titles (top = highest ROI in this
selection).
- Color = number of co-genres (how
many genres appear alongside Horror).
- Bubble size = absolute profit
(revenue − budget), so big bubbles mean huge dollars earned,
not just high ROI.
What we see
The most extreme ROI values are at the far
right, and the top standouts are:
- One Cut of the Dead
- The Blair Witch Project
- Bear Lake Bloodbath
- Routes
- Paranormal Activity
These films achieved massive ROI, usually because
they had very small budgets but earned surprisingly
high revenue.
Paranormal Activity and The Blair Witch
Project stand out as classics of the “micro-budget →
massive payoff” model. Their bubbles are also relatively large,
meaning they are not only high ROI, but also produced large
total profit.
Many other well-known profitable films appear in the strong-ROI
range (still far above 1), such as:
- Halloween
- Night of the Living Dead
- Friday the 13th
- The Texas Chain Saw Massacre
- The Hills Have Eyes
These are films that became major successes relative to
cost, reinforcing the idea that horror often generates big
returns without blockbuster budgets.
Looking at co-genres (colors):
- Most top-ROI films are in the darker colors,
meaning they have few co-genres (often 1–2).
- A few titles show lighter colors (more co-genres), but they are less
common among the very top ROI points.
→ This suggests that the biggest ROI hits are often simple,
focused horror concepts (e.g., horror + thriller/mystery)
rather than heavily blended multi-genre films.
Overall: the most profitable films (by ROI) are
mostly low-budget horror movies that became breakout
hits. Their co-genre counts are usually small,
meaning extreme profitability often comes from straightforward
horror premises rather than highly hybrid genre mixes. Also
important: because ROI uses budget in the denominator,
very small budgets can create extreme ROI values, which is why
the plot warns that micro-budgets can produce “crazy” ROI numbers.
Theme C — Audience reception
C1) Do higher budgets buy higher ratings?
library(dplyr)
library(ggplot2)
library(scales)
library(viridis)
library(dplyr)
library(ggplot2)
library(scales)
# Films usable for the rating analyses: a known rating backed by at least
# 50 votes, plus a known, strictly positive budget.
rate_df <- movies_fin %>%
  filter(
    !is.na(vote_average) &
      vote_count >= 50 &
      !is.na(budget) &
      budget > 0
  )
# Budget (log x-axis) against user rating: filled 2D-density contours, faint
# raw points, and a LOESS smoother with its uncertainty band.
p_budget_rating_adv2 <- rate_df %>%
  ggplot(aes(x = budget, y = vote_average)) +
  stat_density_2d(
    mapping = aes(fill = after_stat(level)),
    geom = "polygon",
    contour = TRUE,
    color = NA,
    alpha = 0.35
  ) +
  geom_point(size = 0.7, alpha = 0.08) +
  geom_smooth(method = "loess", se = TRUE, color = "#2F5597", linewidth = 1.2) +
  # Scales: ratings pinned to 0-10, budget as short-form dollars on a log axis.
  scale_y_continuous(breaks = 0:10, limits = c(0, 10)) +
  scale_x_log10(labels = label_dollar(scale_cut = cut_short_scale())) +
  scale_fill_viridis_c(guide = "none", option = "C") +
  labs(
    x = "Budget (log scale)",
    y = "User rating (0–10)",
    title = "RQ-C1: Budget vs rating",
    subtitle = "Density shading reveals concentration; smoother includes uncertainty band.\nFiltered to movies with ≥ 50 votes."
  ) +
  theme_minimal(base_size = 13) +
  theme(
    panel.grid.minor = element_blank(),
    plot.subtitle = element_text(size = 11, color = "grey30"),
    plot.title = element_text(face = "bold", size = 18)
  )
p_budget_rating_adv2
Here we are checking whether higher budgets buy higher
ratings — meaning: do expensive horror movies get better
user scores, or is budget not strongly related to quality (as viewers
rate it)?
The x-axis is budget (log scale),
so moving right means much larger budgets (each step is a big
multiplier). The y-axis is the user rating
(0–10). (Also, the plot is filtered to movies with ≥ 50
votes, so ratings are more reliable.)
- Gray dots = individual movies.
- Color density blob = where most movies are
concentrated (brighter/warmer = more movies there).
- Blue curve = smooth overall relationship between
budget and rating.
- Gray ribbon around the curve = uncertainty band
(wider = less certainty, usually fewer movies in that range).
What we see
The strongest concentration of movies is in the
mid-to-high budget range (roughly $1M to
$100M) with ratings clustered around ~5.5 to
~6.5. → Most horror films in the dataset sit in a “typical
zone”: moderate budgets and average-to-decent
ratings.
The blue trend line increases slightly as budget
grows, especially from very small budgets up to moderate budgets. → This
suggests a weak positive relationship: bigger budgets
are associated with slightly higher ratings.
However, the curve flattens for larger budgets
(it doesn’t keep climbing sharply). → After a certain point, spending
more money doesn’t guarantee much improvement in ratings.
The uncertainty band is widest on the far left
(very low budgets). → That region likely has fewer
films and/or noisier data, so we are less confident about the
exact trend there.
Even at high budgets, there are still plenty of movies with
only average ratings. → High budget does
not prevent a film from being rated mediocre.
Overall: higher budgets are linked to
slightly better ratings, but the effect is
small. Most horror films—cheap or expensive—end up
rated around 6-ish, and after moderate budgets the
benefit seems to plateau. In other words, money helps a
bit, but it doesn’t “buy” great ratings reliably.
C2) Profitability vs ratings (box + jitter)
library(dplyr)
library(ggplot2)
library(scales)
# Per profit category: how many rated films it holds and their median rating.
grp_stats <- rate_df %>%
  filter(!(is.na(profit_flag) | is.na(vote_average))) %>%
  group_by(profit_flag) %>%
  summarise(
    n = dplyr::n(),
    med = stats::median(vote_average, na.rm = TRUE),
    .groups = "drop"
  )
# Category order for the x-axis, taken from grp_stats (original factor order
# if profit_flag is a factor; otherwise the order summarise() returned).
lvl <- grp_stats$profit_flag
# Attach each film's category stats and build a "<category>\n(n=<count>)"
# axis label whose factor levels follow `lvl`.
rate_df2 <- rate_df %>%
  # Mirror the grp_stats filter: films without a profit category have no
  # stats row and would otherwise end up with an NA label.
  filter(!is.na(profit_flag)) %>%
  left_join(grp_stats, by = "profit_flag") %>%
  mutate(
    profit_flag_lab = factor(
      paste0(profit_flag, "\n(n=", n, ")"),
      # grp_stats has one row per category in `lvl` order, so its n column
      # lines up with `lvl` directly.
      levels = paste0(lvl, "\n(n=", grp_stats$n, ")")
    )
  )
# Rating distribution per profit category: violin (density) behind a box plot,
# jittered raw points on top, and the group median printed as a bold label.
p_profit_rating_adv <- rate_df2 %>%
  ggplot(aes(x = profit_flag_lab, y = vote_average, fill = profit_flag)) +
  geom_violin(trim = TRUE, color = NA, width = 0.9, alpha = 0.18) +
  geom_boxplot(alpha = 0.75, linewidth = 0.8, width = 0.45, outlier.shape = NA) +
  geom_jitter(color = "grey20", size = 1.1, alpha = 0.12, width = 0.16) +
  geom_text(
    # Rebuild the same axis label so the medians land on the matching group.
    data = mutate(grp_stats, profit_flag_lab = paste0(profit_flag, "\n(n=", n, ")")),
    mapping = aes(x = profit_flag_lab, y = med, label = round(med, 2)),
    inherit.aes = FALSE,
    color = "grey20",
    fontface = "bold",
    size = 3.4,
    vjust = -0.9
  ) +
  scale_y_continuous(
    breaks = 0:10,
    limits = c(0, 10),
    expand = expansion(mult = c(0.02, 0.08))
  ) +
  scale_fill_manual(
    guide = "none",
    values = c(
      "Loss" = "#B07AA1",
      "Low profit" = "#4E79A7",
      "Hit" = "#59A14F",
      "Sleeper/Blockbuster" = "#E15759"
    )
  ) +
  labs(
    x = NULL,
    y = "User rating (0–10)",
    title = "RQ-C2: Do profitable movies get higher ratings?",
    subtitle = "Box + jitter, enhanced with violin density, medians, and sample sizes"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    plot.subtitle = element_text(size = 11, color = "grey30"),
    plot.title = element_text(face = "bold", size = 18)
  )
p_profit_rating_adv
Here we are checking whether profitability is associated with
higher ratings — meaning: do horror movies that make more
money tend to be rated better by users?
The y-axis is the user rating
(0–10), and movies are grouped by profit
category on the x-axis.
- Violin shape = rating density (where ratings are
most common in that group).
- Box plot = middle 50% of ratings (IQR) with the
median line.
- Gray dots = individual movies (jittered so we can
see overlap).
- Number on each box = median rating
for that group.
- (n= …) under each group = sample size (how many
movies in that category).
What we see
Loss (n=257) has the lowest median
rating, around 5.9. → Movies that lose money
are, on average, rated slightly lower.
Low profit (n=138) has a median around
6.0. → A small improvement, but still very close to the
loss group.
Hit (n=265) rises to about 6.1
median rating. → Profitable movies tend to score a bit better, but the
difference is still modest.
Sleeper/Blockbuster (n=221) has the
highest median, around 6.3. → The
strongest money-makers are also rated a little higher on
average.
Even though medians increase across groups, the violins
and boxes overlap a lot. → That means there are many
well-rated movies that weren’t profitable, and many
profitable movies with only average ratings.
Overall: profitability is linked to slightly
higher ratings, but the effect is not
dramatic. The median rating climbs from ~5.9
(loss) to ~6.3 (sleeper/blockbuster), yet the
heavy overlap shows that ratings alone don’t strongly predict
financial success (and success doesn’t guarantee great ratings
either).
C3) Runtime “sweet spot” (ratings vs runtime, plus a binned
heatmap)
library(dplyr)
library(ggplot2)
library(scales)
library(viridis)
# Films with a known runtime between 40 and 220 minutes and a known rating
# backed by at least 50 votes.
rt <- movies %>%
  filter(!is.na(runtime), !is.na(vote_average), vote_count >= 50,
         runtime >= 40, runtime <= 220)
# binned mean rating by runtime bins
bin_w <- 5
rt_bins <- rt %>%
  # rt_bin is the lower edge of the bin_w-minute bin the runtime falls into.
  mutate(rt_bin = floor(runtime / bin_w) * bin_w) %>%
  group_by(rt_bin) %>%
  summarise(
    mean_rating = mean(vote_average),
    n = n(),
    .groups = "drop"
  ) %>%
  filter(n >= 20) # avoid tiny bins
# find peak bin (sweet spot estimate)
# with_ties = FALSE keeps exactly one row even when several bins share the top
# mean, so the plot annotation built from `peak` stays a single line and label.
peak <- rt_bins %>% slice_max(mean_rating, n = 1, with_ties = FALSE)
# Runtime against rating. Layers, bottom to top: 2D-density contours, faint raw
# points, LOESS trend with band, binned means (line + count-sized points),
# and a dashed marker with a label at the centre of the best-rated bin.
p_rt_adv <- rt %>%
  ggplot(aes(x = runtime, y = vote_average)) +
  # where the films concentrate
  stat_density_2d(
    mapping = aes(fill = after_stat(level)),
    geom = "polygon",
    contour = TRUE,
    color = NA,
    alpha = 0.30
  ) +
  # individual films, kept faint
  geom_point(size = 0.7, alpha = 0.06) +
  # smooth trend with confidence band
  geom_smooth(
    method = "loess", se = TRUE,
    color = "#2F5597", fill = "#2F5597",
    alpha = 0.12, linewidth = 1.4
  ) +
  # binned means, drawn at the centre of each bin
  geom_line(
    data = rt_bins,
    mapping = aes(x = rt_bin + bin_w / 2, y = mean_rating),
    color = "#F28E2B", linewidth = 1.2
  ) +
  geom_point(
    data = rt_bins,
    mapping = aes(x = rt_bin + bin_w / 2, y = mean_rating, size = n),
    alpha = 0.85, color = "#F28E2B"
  ) +
  # marker and label for the peak bin
  geom_vline(xintercept = peak$rt_bin + bin_w / 2, alpha = 0.6, linetype = "dashed") +
  annotate(
    "label",
    x = peak$rt_bin + bin_w / 2,
    y = peak$mean_rating + 0.35,
    label = paste0("Sweet spot ~ ", peak$rt_bin + bin_w / 2, " min\n(mean=", round(peak$mean_rating, 2), ")"),
    size = 3.6,
    label.size = 0.2
  ) +
  # scales
  scale_fill_viridis_c(guide = "none", option = "C") +
  scale_size_continuous(guide = guide_legend(title = "Films/bin"), range = c(1.5, 5.5)) +
  scale_y_continuous(breaks = 0:10, limits = c(0, 10)) +
  labs(
    x = "Runtime (minutes)",
    y = "User rating (0–10)",
    title = "RQ-C3: Is there a runtime sweet spot?",
    subtitle = "Blue: LOESS (± CI) • Orange: binned mean ratings (5-min bins) • Density shows concentration (≥ 50 votes)"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    panel.grid.minor = element_blank(),
    plot.subtitle = element_text(size = 11, color = "grey30"),
    plot.title = element_text(face = "bold", size = 18)
  )
p_rt_adv
Here we are checking whether horror movies have a runtime “sweet
spot” — meaning: is there a movie length where ratings tend
to be highest?
The x-axis is runtime (minutes) and
the y-axis is user rating (0–10).
(Also, the plot uses movies with ≥ 50 votes, so ratings
are more stable.)
- Gray dots = individual movies.
- Purple density shading = where most movies cluster
(brighter = more movies there).
- Blue line = LOESS smooth trend (overall
relationship) with a confidence band.
- Orange points/line = binned mean
ratings in 5-minute bins (averages by runtime
groups).
- Orange point size = number of films in that runtime
bin (bigger = more films).
- Vertical dashed line + label = estimated “sweet
spot” runtime.
What we see
The strongest concentration of movies is around ~80–110
minutes, with ratings mostly around ~5.5–6.5.
→ This is the “typical horror runtime zone” in the dataset.
The orange binned means rise gradually as
runtime increases from about ~75 minutes up to ~125
minutes. → Longer films (up to a point) tend to be rated
slightly higher on average.
The plot highlights a sweet spot around ~127.5
minutes, with an average rating around ~6.79.
→ In this dataset, movies near ~2 hours (a bit over) get the
best average ratings.
After about ~130 minutes, the blue
smooth line flattens (and even slightly dips at the far right).
→ Past the sweet spot, making a film even longer does
not clearly improve ratings.
The confidence band widens at very short and
very long runtimes. → There are fewer movies in those extremes, so the
estimate is less certain there.
Overall: horror movies show a mild runtime effect:
ratings tend to improve from short runtimes toward about
~120–130 minutes, where the plot suggests a
sweet spot (~127.5 min). After that, the benefit
levels off, meaning very long runtimes don’t
consistently earn higher ratings.
2D binned mean rating: runtime × budget
# Films with a known runtime of 40-220 minutes, a known rating backed by at
# least 50 votes, and a budget of at least 100,000.
rt2 <- movies_fin %>%
  filter(
    !is.na(runtime) & !is.na(vote_average),
    vote_count >= 50,
    dplyr::between(runtime, 40, 220),
    budget >= 1e5
  )
# Hand-made grid: 10-minute runtime bins crossed with quarter-decade
# log10(budget) bins; cells with fewer than 15 films are discarded.
bins <- rt2 %>%
  mutate(
    b_bin = cut(
      log10(budget),
      breaks = seq(from = 5, to = 9, by = 0.25),
      include.lowest = TRUE
    ),
    rt_bin = cut(
      runtime,
      breaks = seq(from = 40, to = 220, by = 10),
      include.lowest = TRUE
    )
  ) %>%
  group_by(rt_bin, b_bin) %>%
  summarise(
    mean_rating = mean(vote_average, na.rm = TRUE),
    n = dplyr::n(),
    .groups = "drop"
  ) %>%
  filter(n >= 15)
# Heatmap of the runtime x budget grid: one tile per kept cell, filled by the
# cell's mean rating (fill scale clamped to the 3-8 range).
bins %>%
  ggplot(aes(x = b_bin, y = rt_bin, fill = mean_rating)) +
  geom_tile(linewidth = 0.2, color = "white") +
  scale_fill_viridis_c(oob = squish, limits = c(3, 8), option = "C") +
  labs(
    x = "Budget bin (log10 USD)",
    y = "Runtime bin (minutes)",
    fill = "Mean rating",
    title = "Runtime × budget grid: where do ratings concentrate?",
    subtitle = "Tiles show mean rating (only cells with at least 15 films)"
  ) +
  theme_minimal(base_size = 11) +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

This heatmap combines runtime and budget together to
see where ratings are stronger.
- X-axis: budget bins (log10 USD) → moving right
means higher budget
- Y-axis: runtime bins (minutes)
- Color: mean (average) rating in
that cell
- Only cells with at least 15 films are shown, so the
averages are not from very small samples.
What we see
1) Ratings are not highest in the “typical” short runtime area
For 80–90 minutes (bottom row), the colors are more
purple/red, meaning ratings are mostly around the mid
level (about ~6). So the most common runtime is not
automatically the best-rated.
2) The best-rated area is longer runtime + mid/high budget
The most yellow/orange tiles are in the 110–120
minute row (top) and middle-to-high budget
bins (right side). This suggests that movies around
110–120 minutes, with bigger budgets,
tend to have higher average ratings.
3) Medium runtimes (90–110) are stable but not “top”
For the 90–100 and 100–110 runtime
bins, ratings are mostly in the middle colors. This means those runtimes
give consistent but not the highest average
ratings.
4) Budget alone still doesn’t guarantee high rating
Even in higher budget bins, some tiles are not very bright. So money
helps sometimes, but ratings still depend on other things like
story, direction, acting, and originality.
Overall, this heatmap shows that the highest mean ratings appear when
runtime is around 110–120 minutes and budget is
medium to high. Shorter movies (80–90 minutes) are
usually rated more average. However, budget alone does not guarantee
high rating, because even expensive movies can still get normal
ratings.
C4) Which co-genres are associated with higher ratings?
(descriptive)
library(plotly)
# Rating summary per co-genre (every genre listed next to "Horror"), limited
# to films with at least 50 votes.
# All statistics come from one grouped summarise instead of running the same
# filter/unnest pipeline twice and joining the two results.
rating_by_cogenre <- movies %>%
  filter(!is.na(vote_average), !is.na(vote_count), vote_count >= 50) %>% # reliability filter
  mutate(cogenres = purrr::map(genre_list, ~ setdiff(.x, "Horror"))) %>%
  select(vote_average, vote_count, cogenres) %>%
  # One row per (film, co-genre) pair.
  tidyr::unnest(cogenres) %>%
  filter(!is.na(cogenres), cogenres != "") %>%
  group_by(cogenres) %>%
  summarise(
    weighted_n = sum(vote_count),              # total votes behind the co-genre
    mean_rating = mean(vote_average),
    se = sd(vote_average) / sqrt(n()),         # standard error of the mean
    n_movies = n(),
    .groups = "drop"
  ) %>%
  # Keep well-populated co-genres only, then the 12 best-rated ones.
  filter(n_movies >= 80) %>%
  arrange(desc(mean_rating)) %>%
  slice_head(n = 12)
# Vertical bar chart of mean rating per co-genre, bars ordered by mean rating,
# with error bars of 1.96 * se and a custom hover text.
rating_by_cogenre %>%
  plot_ly(
    type = "bar",
    x = ~reorder(cogenres, mean_rating),
    y = ~mean_rating,
    error_y = list(type = "data", array = ~1.96 * se, visible = TRUE),
    hoverinfo = "text",
    text = ~paste0("Mean: ", round(mean_rating,2), "<br>n movies: ", n_movies)
  ) %>%
  layout(
    title = "RQ-C4: Which co-genres tend to have higher ratings?",
    yaxis = list(title = "Mean vote_average (vote_count ≥ 50)"),
    xaxis = list(title = "", tickangle = -30),
    margin = list(b = 120)
  )
library(dplyr)
library(tidyr)
library(plotly)
library(forcats)
# Rating summary per co-genre for the horizontal bar chart: mean, standard
# error, 1.96 * se interval and a ready-made hover text for the 12 best-rated
# co-genres that have at least 80 films (films need at least 50 votes).
rating_by_cogenre <- movies %>%
  filter(!is.na(vote_average) & !is.na(vote_count) & vote_count >= 50) %>%
  mutate(cogenres = purrr::map(genre_list, ~ setdiff(.x, "Horror"))) %>%
  select(vote_average, vote_count, cogenres) %>%
  tidyr::unnest(cogenres) %>%
  filter(!is.na(cogenres) & cogenres != "") %>%
  group_by(cogenres) %>%
  summarise(
    n_movies = n(),
    mean_rating = mean(vote_average),
    se = sd(vote_average) / sqrt(n_movies),
    .groups = "drop"
  ) %>%
  mutate(ci = 1.96 * se) %>%
  filter(n_movies >= 80) %>%
  arrange(desc(mean_rating)) %>%
  slice_head(n = 12) %>%
  # Order the factor by mean rating so the bars are sorted in the chart.
  mutate(cogenres = fct_reorder(cogenres, mean_rating)) %>%
  mutate(
    ci_low = mean_rating - ci,
    ci_high = mean_rating + ci
  ) %>%
  mutate(
    hover_txt = paste0(
      "<b>", cogenres, "</b><br>",
      "Mean rating: ", sprintf("%.2f", mean_rating), "<br>",
      "95% CI: [", sprintf("%.2f", ci_low), ", ", sprintf("%.2f", ci_high), "]<br>",
      "Movies: ", n_movies,
      "<extra></extra>"
    )
  )
# Right edge of the x-axis: the largest upper interval bound.
x_max <- max(rating_by_cogenre[["ci_high"]], na.rm = TRUE)
# Horizontal bar chart: bar length and colour both encode the mean rating;
# error bars show +/- ci and the hover box uses the prepared hover_txt.
rating_by_cogenre %>%
  plot_ly(
    type = "bar",
    orientation = "h",
    x = ~mean_rating,
    y = ~cogenres,
    error_x = list(type = "data", array = ~ci, visible = TRUE),
    marker = list(
      color = ~mean_rating,
      colorscale = "Viridis",
      showscale = TRUE,
      line = list(color = "rgba(0,0,0,0.25)", width = 1)
    ),
    hovertemplate = "%{hovertext}",
    hovertext = ~hover_txt
  ) %>%
  layout(
    title = list(text = "RQ-C4: Which co-genres tend to have higher ratings?", x = 0.02),
    yaxis = list(title = ""),
    xaxis = list(
      range = c(0, x_max + 0.25),
      title = "Mean vote_average (vote_count ≥ 50)"
    ),
    margin = list(l = 140, r = 60, t = 60, b = 60)
  )
Here we are checking which co-genres tend to have higher
ratings — meaning: when Horror is combined with another
genre (Drama, Mystery, Thriller, etc.), which combinations are rated
best by users?
The x-axis is the average user
rating, and each bar is one co-genre.
- Bar length = average rating for horror movies that
include that co-genre.
- Bar color = also reflects the rating (brighter =
higher).
- Blue error bars = uncertainty around the mean
(small = more stable estimate, large = more variability / fewer
films).
What we see
The highest-rated co-genre combination here is
Horror + Drama, with an average rating a bit above
6.1. → Horror films with strong dramatic elements tend
to be rated slightly better.
Next, Fantasy, Crime, and
Mystery also sit near the top (around
~6.0–6.1). → These mixes often add story depth,
plot structure, or investigation, which may boost audience
satisfaction.
Comedy and Thriller are in the
middle range (around ~5.8–5.9). →
These are very common horror pairings, but they don’t stand out as the
highest-rated on average.
The lowest-rated co-genres in this list are
Action and Science Fiction (closer to
~5.6–5.7). → Horror mixed with action/sci-fi appears
slightly less liked on average in this dataset.
The differences are not huge overall (roughly a
0.4–0.5 rating spread from bottom to top). → So
co-genre matters a bit, but it’s not a dramatic effect.
Overall: the co-genres most associated with higher
ratings are Drama, Fantasy, Crime, and Mystery, while
Action and Science Fiction trend
lower. Still, the gaps are fairly small, meaning co-genre influences
ratings only moderately, not massively.
Interactive highlight (Plotly)
This interactive plot lets the viewer hover to see movie titles and
key metrics.
# Films with a known rating and at least 200 votes, each carrying an HTML
# hover text with title, year, rating, budget, revenue and ROI.
fin_small <- movies_fin %>%
  filter(!is.na(vote_average) & vote_count >= 200) %>%
  mutate(
    tooltip = paste0(
      "<b>", title, "</b>",
      "<br>Year: ", year,
      "<br>Rating: ", round(vote_average, 1), " (", comma(vote_count), " votes)",
      "<br>Budget: $", comma(round(budget)),
      "<br>Revenue: $", comma(round(revenue)),
      "<br>ROI: ", round(roi, 2), "x"
    )
  )
library(viridisLite)
# Interactive budget-vs-revenue scatter on log-log axes; points are coloured
# by rating and show the prepared tooltip on hover.
fin_small %>%
  plotly::plot_ly(
    type = "scatter",
    mode = "markers",
    x = ~budget,
    y = ~revenue,
    color = ~vote_average,
    colors = viridisLite::viridis(256),
    marker = list(size = 7, opacity = 0.65),
    hoverinfo = "text",
    text = ~tooltip
  ) %>%
  layout(
    title = "Interactive: Budget vs Revenue (hover for details)",
    yaxis = list(title = "Revenue (log)", type = "log"),
    xaxis = list(title = "Budget (log)", type = "log")
  )
This interactive Plotly chart is the same Budget vs Revenue
idea, but now the viewer can hover on any
point to see the movie’s details (title, year, rating + vote count,
budget, revenue, ROI). This makes the analysis more “real” because we
can move from patterns to specific examples.
What we see
1) Clear positive pattern (but not perfect)
Most points follow an upward direction: higher budget usually
connects to higher revenue. So spending more money often gives
more earning potential.
But the points are still spread out, so budget is not a
guarantee.
2) Many movies lose money even with decent budgets
When a point has revenue lower than budget, the ROI
will be below 1 (like the example shown: ROI 0.57x).
This means some movies with millions of dollars budget still
don’t make it back.
3) We can quickly find outliers (big wins and big failures)
With hover, we can identify:
- Sleeper hits (low budget but high revenue, huge
ROI)
- Over-budget failures (high budget but low revenue)
This is harder to do with a static plot.
Why this interactive plot is useful
This plot helps us explore the relationship between money and
success, but also lets us see the exact movies behind extreme points. It
supports our story that horror has both: blockbuster-style hits and
low-budget surprise wins.
Overall, the interactive plot shows budget and revenue are generally
related, but there are many exceptions. By hovering, we can find movies
that are surprise hits (high ROI) and movies that fail to earn back
their budget. Also, higher rating does not always mean higher
revenue.
Conclusion
In this project we used the Horror Movies dataset to understand the
“story behind horror” from three sides: how the genre changed
over time, how money works in horror, and
how audiences react.
First, horror clearly became much bigger over the years. The number
of releases grows slowly in early decades, but after the 2000s
(especially late 2010s) it increases very fast. At the same time,
co-genre patterns show that horror keeps mixing with other styles.
Thriller stays an important partner genre, while some older combinations
(like horror + sci-fi) become less common. Runtime also stays quite
stable: most horror movies are around ~90 minutes, but
in recent decades we see more longer movies too.
Second, the financial analysis shows that horror is a genre where
low-budget films can win big. Budget and revenue are
related, but the scatterplots show many exceptions—some high-budget
movies still fail, and some small movies become huge hits. The ROI table
confirms this strongly: several famous horror movies earned many times
their budget, meaning horror can be very profitable even without
blockbuster spending. Over time, typical (median) budgets and revenues
do not increase smoothly, and recent years can be affected by missing
data or industry changes.
Third, audience reception results suggest that money does not
buy high ratings. Bigger budgets only slightly improve ratings
at low levels, but after medium budgets the ratings mostly stay around
the same level. Profitable movies have a little higher ratings on
average, but there is a lot of overlap, so financial success and
audience satisfaction are not the same thing. For runtime, we found a
small “sweet spot”: longer movies (around 110–130
minutes) can have slightly higher average ratings, and the
heatmap shows that the best mean ratings often appear in the
longer runtime + medium/high budget area.
Overall, our results tell one clear story: horror is a
flexible genre that grows fast, experiments with styles, and can
generate surprising profits, but audience ratings depend more
on creative choices than just budget.
7) Reproducibility
# Print the R version, platform, locale and attached/loaded package versions
# of this session so the report can be reproduced.
sessionInfo()
## R version 4.4.2 (2024-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS 26.2
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Europe/Warsaw
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] visNetwork_2.1.2 viridis_0.6.5 viridisLite_0.4.2 DT_0.33
## [5] ggiraph_0.9.2 ggrepel_0.9.6 patchwork_1.3.2 plotly_4.11.0
## [9] ggExtra_0.11.0 ggridges_0.5.7 scales_1.4.0 lubridate_1.9.3
## [13] forcats_1.0.0 stringr_1.6.0 dplyr_1.1.4 purrr_1.2.0
## [17] readr_2.1.5 tidyr_1.3.1 tibble_3.3.0 ggplot2_4.0.1
## [21] tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 xfun_0.54 bslib_0.9.0
## [4] htmlwidgets_1.6.4 lattice_0.22-6 tzdb_0.4.0
## [7] crosstalk_1.2.2 vctrs_0.6.5 tools_4.4.2
## [10] generics_0.1.4 parallel_4.4.2 pkgconfig_2.0.3
## [13] Matrix_1.7-1 data.table_1.17.8 RColorBrewer_1.1-3
## [16] S7_0.2.0 lifecycle_1.0.4 compiler_4.4.2
## [19] farver_2.1.2 httpuv_1.6.15 fontquiver_0.2.1
## [22] fontLiberation_0.1.0 htmltools_0.5.8.1 sass_0.4.10
## [25] yaml_2.3.10 lazyeval_0.2.2 crayon_1.5.3
## [28] later_1.4.4 pillar_1.11.1 jquerylib_0.1.4
## [31] MASS_7.3-61 cachem_1.1.0 nlme_3.1-166
## [34] mime_0.13 fontBitstreamVera_0.1.1 tidyselect_1.2.1
## [37] digest_0.6.38 stringi_1.8.7 labeling_0.4.3
## [40] splines_4.4.2 fastmap_1.2.0 grid_4.4.2
## [43] cli_3.6.5 magrittr_2.0.4 utf8_1.2.6
## [46] withr_3.0.2 gdtools_0.4.4 promises_1.5.0
## [49] bit64_4.5.2 timechange_0.3.0 rmarkdown_2.30
## [52] httr_1.4.7 igraph_2.1.2 bit_4.5.0
## [55] otel_0.2.0 gridExtra_2.3 hms_1.1.3
## [58] shiny_1.9.1 evaluate_1.0.5 knitr_1.50
## [61] miniUI_0.1.1.1 mgcv_1.9-1 rlang_1.1.6
## [64] isoband_0.2.7 Rcpp_1.1.0 xtable_1.8-4
## [67] glue_1.8.0 vroom_1.6.5 rstudioapi_0.17.1
## [70] jsonlite_2.0.0 R6_2.6.1 systemfonts_1.3.1