read in data
# Load the movie dataset, then preview its first rows and list its columns.
movies <- read.csv(file = "movies_final.csv")
head(movies, n = 6)
names(movies)
## [1] "genres" "popularity"
## [3] "production_company_main" "release_month"
## [5] "release_day" "release_year"
## [7] "runtime" "title"
## [9] "vote_average" "vote_count"
## [11] "director" "metascore"
## [13] "oscar_nominations" "oscar_wins"
## [15] "filtered_overview" "actor1"
## [17] "actor2" "actor3"
## [19] "actor4" "actor5"
## [21] "domestic_revenue" "total_revenue"
## [23] "estimated_budget" "profit"
checking for multicollinearity
# Pairwise correlations between the numeric budget, revenue, popularity and
# reception columns, used to spot strongly related variables.
numeric_vars <- c(
  "estimated_budget", "popularity", "total_revenue", "domestic_revenue",
  "runtime", "vote_average", "metascore", "oscar_nominations", "oscar_wins"
)
cor_matrix <- cor(movies[, numeric_vars])
print(cor_matrix)
## estimated_budget popularity total_revenue domestic_revenue
## estimated_budget 1.00000000 0.3751724 0.6681704 0.5732470
## popularity 0.37517239 1.0000000 0.5592194 0.5160689
## total_revenue 0.66817043 0.5592194 1.0000000 0.9300320
## domestic_revenue 0.57324705 0.5160689 0.9300320 1.0000000
## runtime 0.22130186 0.1711806 0.2207753 0.2063764
## vote_average -0.01781197 0.3144203 0.2168878 0.2510961
## metascore -0.01926886 0.2228885 0.1908529 0.2362679
## oscar_nominations -0.01070569 0.1511423 0.1868755 0.2176804
## oscar_wins -0.02511617 0.1178755 0.1730198 0.1873358
## runtime vote_average metascore oscar_nominations
## estimated_budget 0.2213019 -0.01781197 -0.01926886 -0.01070569
## popularity 0.1711806 0.31442032 0.22288853 0.15114230
## total_revenue 0.2207753 0.21688776 0.19085289 0.18687550
## domestic_revenue 0.2063764 0.25109614 0.23626787 0.21768044
## runtime 1.0000000 0.43822659 0.33638179 0.46422390
## vote_average 0.4382266 1.00000000 0.72253190 0.45833258
## metascore 0.3363818 0.72253190 1.00000000 0.51129869
## oscar_nominations 0.4642239 0.45833258 0.51129869 1.00000000
## oscar_wins 0.3728642 0.33373347 0.36538182 0.80591048
## oscar_wins
## estimated_budget -0.02511617
## popularity 0.11787549
## total_revenue 0.17301980
## domestic_revenue 0.18733580
## runtime 0.37286417
## vote_average 0.33373347
## metascore 0.36538182
## oscar_nominations 0.80591048
## oscar_wins 1.00000000
exploratory data analysis
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(trelliscopejs)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.4 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggrepel)
library(treemap)
dive into distributions of some numeric variables
# Distribution summaries for total revenue, estimated budget and release year.
summary(movies[["total_revenue"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.523e+06 4.354e+07 9.088e+07 1.613e+08 1.937e+08 2.924e+09
summary(movies[["estimated_budget"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 15000 18000000 35000000 49790011 66000000 380000000
summary(movies[["release_year"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1937 1999 2005 2003 2011 2016
minimum and maximum revenues
# Row of the film with the highest total revenue. which.max() returns a single
# index and ignores NA; reusing that index below avoids a hard-coded row number
# that would silently point at the wrong film if the data were re-sorted.
max_revenue_row <- which.max(movies$total_revenue)
max_revenue_row
## [1] 1
movies[max_revenue_row, c("total_revenue", "title")]
# Same lookup for the film with the lowest total revenue.
min_revenue_row <- which.min(movies$total_revenue)
min_revenue_row
## [1] 1560
movies[min_revenue_row, c("total_revenue", "title")]
success of most common directors and actors
# pull out top 25 directors and actors
# Frequencies are sorted from most to least common. head() returns at most 25
# entries, so a dataset with fewer than 25 distinct names does not pick up NA
# placeholders the way indexing with `[1:25]` would.
directors <- table(movies$director)
top25_directors <- names(head(sort(directors, decreasing = TRUE), 25))
actors <- table(movies$actor1)
top25_actors <- names(head(sort(actors, decreasing = TRUE), 25))
# Bubble chart for the 25 most frequent directors: number of films (x) against
# mean metascore (y), with point size driven by mean revenue per film in millions.
movies %>%
  filter(director %in% top25_directors) %>%
  group_by(director) %>%
  summarise(
    n_movies = n(),
    mean_metascore = mean(metascore),
    mean_revenue_millions = mean(total_revenue) / 1e6
  ) %>%
  ggplot(aes(x = n_movies, y = mean_metascore, size = mean_revenue_millions)) +
  geom_point(colour = "gray", fill = "lightyellow1", shape = 21) +
  geom_text_repel(aes(label = director), size = 2, vjust = 0, max.overlaps = 25) +
  xlim(5, 30) +
  ylim(35, 85) +
  labs(
    title = "Top 25 Most Common Directors and their Success",
    x = "Movie Count",
    y = "Average Metascore",
    size = "Average Movie\nRevenue (millions)"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
# Bubble chart for the 25 most frequent lead actors: number of films (x) against
# mean metascore (y), with point size driven by mean revenue per film in millions.
movies %>%
  filter(actor1 %in% top25_actors) %>%
  group_by(actor1) %>%
  summarise(
    n_movies = n(),
    mean_metascore = mean(metascore),
    mean_revenue_millions = mean(total_revenue) / 1e6
  ) %>%
  ggplot(aes(x = n_movies, y = mean_metascore, size = mean_revenue_millions)) +
  geom_point(colour = "gray", fill = "lightyellow1", shape = 21) +
  geom_text_repel(aes(label = actor1), size = 2, vjust = 0, max.overlaps = 25) +
  xlim(10, 30) +
  ylim(40, 70) +
  labs(
    title = "Top 25 Most Common Actors and their Success",
    x = "Movie Count",
    y = "Average Metascore",
    size = "Average Movie\nRevenue (millions)"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
rank these actors and directors by metascore
# Mean metascore for each of the 25 most frequent lead actors, highest first.
actors_ranked <- movies %>%
  filter(actor1 %in% top25_actors) %>%
  group_by(actor1) %>%
  summarise(avg_score = mean(metascore), .groups = "drop") %>%
  arrange(desc(avg_score))
print(actors_ranked)
# Mean metascore for each of the 25 most frequent directors, highest first.
directors_ranked <- movies %>%
  filter(director %in% top25_directors) %>%
  group_by(director) %>%
  summarise(avg_score = mean(metascore), .groups = "drop") %>%
  arrange(desc(avg_score))
print(directors_ranked)
rank them by average revenue
# Mean total revenue for each of the 25 most frequent lead actors, highest first.
# This reassigns actors_ranked, replacing any ranking stored under that name.
actors_ranked <- movies %>%
  filter(actor1 %in% top25_actors) %>%
  group_by(actor1) %>%
  summarise(avg_revenue = mean(total_revenue), .groups = "drop") %>%
  arrange(desc(avg_revenue))
print(actors_ranked)
# Mean total revenue for each of the 25 most frequent directors, highest first.
# This reassigns directors_ranked in the same way.
directors_ranked <- movies %>%
  filter(director %in% top25_directors) %>%
  group_by(director) %>%
  summarise(avg_revenue = mean(total_revenue), .groups = "drop") %>%
  arrange(desc(avg_revenue))
print(directors_ranked)
look at standout directors/actors
# Show every film by a few standout directors and lead actors.
# %in% never returns NA, so rows with a missing director/actor are skipped
# rather than appearing as all-NA rows, which `==` subsetting would produce.
movies[movies$director %in% "Martin Scorsese", ]
movies[movies$director %in% "Steven Spielberg", ]
movies[movies$actor1 %in% "Leonardo DiCaprio", ]
movies[movies$actor1 %in% "Robert De Niro", ]
movies[movies$actor1 %in% "Will Smith", ]
top 10 most profitable movies across the world
# Ten films with the highest total revenue, plotted against estimated budget
# (both in millions). slice_max() replaces arrange() + the superseded top_n():
# it keeps ties and returns rows in descending order of revenue.
# NOTE(review): the ranking uses total_revenue, not the profit column, although
# the plot title says "Profitable" - confirm which measure is intended.
movies %>%
  slice_max(total_revenue, n = 10) %>%
  ggplot(aes(x = estimated_budget / 1000000, y = total_revenue / 1000000)) +
  ylim(c(1000, 3000)) +
  xlim(c(50, 300)) +
  geom_point(col = "royalblue") +
  geom_smooth(col = "cadetblue3", se = FALSE) +
  geom_text(aes(label = title), vjust = 1.5, size = 2.5) +
  labs(
    x = "Estimated Budget (millions)",
    y = "Revenue (millions)",
    title = "Top 10 Profitable Movies"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
top 10 most profitable movies in the US
# Ten films with the highest domestic revenue, plotted against estimated budget
# (both in millions). slice_max() replaces arrange() + the superseded top_n():
# it keeps ties and returns rows in descending order of domestic revenue.
# NOTE(review): the ranking uses domestic_revenue, not profit, although the
# plot title says "Profitable" - confirm which measure is intended.
movies %>%
  slice_max(domestic_revenue, n = 10) %>%
  ggplot(aes(x = estimated_budget / 1000000, y = domestic_revenue / 1000000)) +
  ylim(c(300, 800)) +
  xlim(c(0, 300)) +
  geom_point(col = "mediumvioletred") +
  geom_smooth(col = "plum1", se = FALSE) +
  geom_text(aes(label = title), vjust = 1.5, size = 2) +
  labs(
    x = "Estimated Budget (millions)",
    y = "Domestic Revenue (millions)",
    title = "Top 10 Profitable Movies Domestically"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
distributions
# release years
# Bar chart counting how many films were released in each year.
movies %>%
  ggplot(aes(x = release_year)) +
  geom_bar(colour = "gray", fill = "darkseagreen1") +
  labs(
    title = "Histogram of Movie Releases",
    x = "Release Year",
    y = "Movie Count"
  )
# pull out top 10 production companies
# head() returns at most 10 entries, so fewer than 10 distinct companies does
# not introduce NA names the way indexing with `[1:10]` would.
prod <- table(movies$production_company_main)
top10 <- names(head(sort(prod, decreasing = TRUE), 10))
# production companies
# Bar chart of film counts for those most frequent production companies.
movies %>%
  filter(production_company_main %in% top10) %>%
  ggplot(aes(production_company_main)) +
  geom_bar(fill = "darkseagreen1", col = "gray") +
  labs(
    x = "Top 10 Production Companies",
    y = "Movie Count",
    title = "Histogram of Production Companies"
  ) +
  theme(axis.text.x = element_text(angle = 15, hjust = 1, size = 5))
# create new dummy variables for genres
# Each name is the new 0/1 column; each value is the pattern searched for in
# movies$genres. The order here fixes the order in which columns are appended.
genre_patterns <- c(
  action = "Action",
  romance = "Romance",
  drama = "Drama",
  comedy = "Comedy",
  adventure = "Adventure",
  fantasy = "Fantasy",
  scifi = "Science Fiction",
  thriller = "Thriller",
  horror = "Horror",
  crime = "Crime",
  family = "Family",
  animation = "Animation",
  mystery = "Mystery",
  history = "History",
  war = "War",
  western = "Western",
  music = "Music",
  doc = "Documentary"
)
# 1 when the genre string contains the pattern, otherwise 0.
movies[names(genre_patterns)] <- lapply(
  genre_patterns,
  function(pattern) as.numeric(grepl(pattern, movies$genres))
)
# genres
# Bar chart of how many films carry each genre flag; a film with several genres
# is counted once per genre. The indicator columns are selected by name
# (action through doc) instead of by position, so the selection stays correct
# if columns are added or removed earlier in the data frame.
movies %>%
  pivot_longer(cols = action:doc, names_to = "genre", values_to = "present") %>%
  filter(present == 1) %>%
  ggplot(aes(x = genre)) +
  geom_bar(fill = "darkseagreen1", col = "gray") +
  labs(x = "Genres", y = "Movie Count", title = "Histogram of Movie Genres") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 7))
oscars alignment with revenues
# create new dummy variable for oscar wins
# Two-level label: films with two or more Oscar wins versus all others.
movies$oscar_dummy <- ifelse(
  movies$oscar_wins >= 2,
  yes = "Won at least 2 Oscars",
  no = "Won less than 2 Oscars"
)
# Revenue (millions) by Oscar group, keeping only films below 1 billion in
# total revenue. The two fill colours are supplied one per box.
movies %>%
  filter(total_revenue < 1e9) %>%
  ggplot(aes(x = oscar_dummy, y = total_revenue / 1e6)) +
  geom_boxplot(fill = c("thistle1", "orchid")) +
  labs(
    y = "Revenue (millions)",
    title = "Movies by Oscar/Revenue Success"
  ) +
  theme(axis.title.x = element_blank())
oscars alignment with metascores
# Metascore distribution by Oscar group; the two fill colours are supplied one
# per box.
ggplot(movies, aes(x = oscar_dummy, y = metascore)) +
  geom_boxplot(fill = c("paleturquoise", "turquoise4")) +
  labs(
    y = "Metascore",
    title = "Movies by Oscar/Critic Success"
  ) +
  theme(axis.title.x = element_blank())
success of different production companies
# pull out top 10 production companies
# head() returns at most 10 entries, so fewer than 10 distinct companies does
# not introduce NA names the way indexing with `[1:10]` would.
prod_companies <- table(movies$production_company_main)
top10_companies <- names(head(sort(prod_companies, decreasing = TRUE), 10))
# Bubble chart per company: number of films (x), mean metascore (y) and summed
# total revenue in billions (point size).
movies %>%
  filter(production_company_main %in% top10_companies) %>%
  group_by(production_company_main) %>%
  summarise(
    movie_count = n(),
    avg_score = mean(metascore),
    total_revenue_generated = sum(total_revenue) / 1000000000
  ) %>%
  ggplot(aes(x = movie_count, y = avg_score, size = total_revenue_generated)) +
  geom_point(col = "gray", bg = "salmon", pch = 21) +
  geom_text_repel(
    aes(label = production_company_main),
    size = 2, vjust = 0, max.overlaps = 25
  ) +
  labs(
    x = "Movie Count",
    y = "Average Metascore",
    title = "Top 10 Most Common Production Companies and their Success",
    size = "Total Movie\nRevenue (Billions)"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
success of different genres
# Revenue (millions) by genre for films with total revenue of at most 1 billion.
# Genre indicator columns are selected by name (action through doc) instead of
# by position, so the selection does not break when columns are added or
# removed earlier in the data frame.
# NOTE(review): fill = rainbow(18) supplies one colour per box and so assumes
# all 18 genres still have films after the revenue filter - confirm.
movies %>%
  filter(total_revenue <= 1000000000) %>%
  pivot_longer(cols = action:doc, names_to = "genre", values_to = "present") %>%
  filter(present == 1) %>%
  ggplot(aes(x = genre, y = total_revenue / 1000000)) +
  geom_boxplot(fill = rainbow(18)) +
  labs(
    title = "Movies by Genre vs Revenue",
    x = "Genre",
    y = "Revenue (millions)"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 7))
# Metascore distribution by genre. Genre indicator columns are selected by name
# (action through doc) instead of by position, so the selection does not break
# when columns are added or removed earlier in the data frame.
# NOTE(review): fill = rainbow(18) supplies one colour per box and so assumes
# all 18 genres are present - confirm.
movies %>%
  pivot_longer(cols = action:doc, names_to = "genre", values_to = "present") %>%
  filter(present == 1) %>%
  ggplot(aes(x = genre, y = metascore)) +
  geom_boxplot(fill = rainbow(18)) +
  labs(
    title = "Movies by Genre vs Metascore",
    x = "Genre",
    y = "Metascore"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 7))
seasonal movie success
# Heat map of mean revenue (millions) for each release year / month cell,
# using only films with total revenue of at most 1 billion. Month numbers are
# converted to an ordered factor of month names so the y axis runs Jan-Dec.
movies %>%
  filter(total_revenue <= 1e9) %>%
  mutate(release_month = factor(release_month, levels = 1:12, labels = month.name)) %>%
  group_by(release_year, release_month) %>%
  summarise(
    avg_revenue = mean(total_revenue, na.rm = TRUE) / 1e6,
    .groups = "drop"
  ) %>%
  ggplot(aes(x = release_year, y = release_month, fill = avg_revenue)) +
  geom_tile(colour = "white") +
  scale_fill_gradient(low = "lightyellow", high = "slateblue") +
  labs(
    x = "Release Year",
    y = "Month",
    fill = "Average Revenue\n(millions)",
    title = "Average Movie Revenue by Month and Year"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8)
  )
# Mean total revenue per release year (millions), drawn as points joined by a
# dashed line.
movies %>%
  group_by(release_year) %>%
  summarise(avg_revenue = mean(total_revenue), .groups = "drop") %>%
  ggplot(aes(x = release_year, y = avg_revenue / 1e6)) +
  geom_point(colour = "dodgerblue", fill = "gray", shape = 21) +
  geom_line(colour = "royalblue", linetype = 2) +
  labs(
    x = "Release Year",
    y = "Average Movie Revenue (millions)",
    title = "Movie Revenue Over Time"
  )
franchise/sequel analysis
# create new dummy variables for movie franchises/sequels
# Studio-based flags: 1 when the main production company matches the pattern.
movies$marvel <- as.numeric(grepl("Marvel", movies$production_company_main))
movies$dc <- as.numeric(grepl("DC Comics", movies$production_company_main))
# Title-based flags: 1 when the film title matches the pattern.
movies$harry_potter <- as.numeric(grepl("Harry Potter", movies$title))
movies$hunger_games <- as.numeric(grepl("Hunger Games", movies$title))
movies$lord_of_the_rings <- as.numeric(grepl("The Lord of", movies$title))
movies$toy_story <- as.numeric(grepl("Toy Story", movies$title))
movies$indiana_jones <- as.numeric(grepl("Indiana Jones", movies$title))
# Treemap of franchise/sequel groups: tile size from total_revenue, tile colour
# from metascore. The franchise indicator columns are selected by name (marvel
# through indiana_jones) instead of by position, so the selection does not
# break when columns are added or removed earlier in the data frame.
# NOTE(review): with type = "value" treemap aggregates vColor per index group;
# check the package default to confirm the colour reflects the intended
# statistic (e.g. mean rather than summed metascore).
movies %>%
  pivot_longer(
    cols = marvel:indiana_jones,
    names_to = "franchise_sequel",
    values_to = "present"
  ) %>%
  filter(present == 1) %>%
  treemap(
    index = "franchise_sequel",
    vSize = "total_revenue",
    vColor = "metascore",
    type = "value",
    palette = "Reds",
    title = "Franchise/Sequel Movie Success",
    fontsize.labels = 12,
    border.col = "white"
  )
# Scatter plot of total revenue (millions) against release year for the films
# flagged in one franchise indicator column, with a linear trend line and each
# point labelled by title. Replaces four copies of the same pipeline.
#   data       data frame with release_year, total_revenue, title and the flag
#   flag_col   name (string) of the 0/1 indicator column to keep
#   plot_title title shown above the plot
# Returns the ggplot object, which prints when called at top level.
plot_franchise_revenue <- function(data, flag_col, plot_title) {
  data %>%
    filter(.data[[flag_col]] == 1) %>%
    ggplot(aes(x = release_year, y = total_revenue / 1000000)) +
    geom_point(col = "gold") +
    geom_smooth(method = "lm", col = "tomato", se = FALSE) +
    geom_text(aes(label = title), vjust = 1.5, size = 2.5) +
    labs(x = "Release Year", y = "Revenue (millions)", title = plot_title) +
    theme(plot.title = element_text(hjust = 0.5))
}

plot_franchise_revenue(movies, "marvel", "Marvel Movies")
plot_franchise_revenue(movies, "dc", "DC Movies")
plot_franchise_revenue(movies, "harry_potter", "Harry Potter Movies")
plot_franchise_revenue(movies, "hunger_games", "Hunger Games Movies")
create a variable for the ratio of revenue to budget (e.g. "total box office
revenue is 10 times the production budget")
# Ratio of total revenue to estimated budget for every film.
movies$revenue_to_budget <- movies$total_revenue / movies$estimated_budget
# Rows whose revenue exceeds 100 times the budget.
which(movies$revenue_to_budget > 100)
## [1] 1294 1652 1658 1659 1660 1662 1663 2226 2232 2234 2236
# leave out these movies as they are outliers
# look at the max revenue : budget
# which.max() returns a single index and ignores NA; reusing it below avoids a
# hard-coded row number that would go stale if the data were re-sorted.
max_ratio_row <- which.max(movies$revenue_to_budget)
max_ratio_row
## [1] 1663
movies[max_ratio_row, ]
# Paranormal Activity made more than 12000 times its budget
# Metascore against vote average, with point size from the revenue-to-budget
# ratio, keeping only films whose ratio is below 100. The filter refers to the
# column of the piped data rather than `movies$...`, so it stays aligned with
# whatever rows reach this step.
movies %>%
  filter(revenue_to_budget < 100) %>%
  ggplot(aes(x = metascore, y = vote_average, size = revenue_to_budget)) +
  geom_point(col = "black", bg = "magenta", pch = 21)
export data to new csv
# Save the data frame, including the columns added above, without row names.
write.csv(x = movies, file = "movie_data_final.csv", row.names = FALSE)