read in data

# Load the movie dataset from disk into a data frame.
movies <- read.csv(file = "movies_final.csv")
# Preview the first six rows, then list every column name.
head(movies, n = 6L)
names(movies)
##  [1] "genres"                  "popularity"             
##  [3] "production_company_main" "release_month"          
##  [5] "release_day"             "release_year"           
##  [7] "runtime"                 "title"                  
##  [9] "vote_average"            "vote_count"             
## [11] "director"                "metascore"              
## [13] "oscar_nominations"       "oscar_wins"             
## [15] "filtered_overview"       "actor1"                 
## [17] "actor2"                  "actor3"                 
## [19] "actor4"                  "actor5"                 
## [21] "domestic_revenue"        "total_revenue"          
## [23] "estimated_budget"        "profit"

checking for multicollinearity

# Numeric columns compared pairwise when screening for collinear variables.
numeric_cols <- c(
  "estimated_budget", "popularity", "total_revenue", "domestic_revenue",
  "runtime", "vote_average", "metascore", "oscar_nominations", "oscar_wins"
)
# Pairwise correlation matrix over those columns, printed for inspection.
cor_matrix <- cor(movies[, numeric_cols])
cor_matrix
##                   estimated_budget popularity total_revenue domestic_revenue
## estimated_budget        1.00000000  0.3751724     0.6681704        0.5732470
## popularity              0.37517239  1.0000000     0.5592194        0.5160689
## total_revenue           0.66817043  0.5592194     1.0000000        0.9300320
## domestic_revenue        0.57324705  0.5160689     0.9300320        1.0000000
## runtime                 0.22130186  0.1711806     0.2207753        0.2063764
## vote_average           -0.01781197  0.3144203     0.2168878        0.2510961
## metascore              -0.01926886  0.2228885     0.1908529        0.2362679
## oscar_nominations      -0.01070569  0.1511423     0.1868755        0.2176804
## oscar_wins             -0.02511617  0.1178755     0.1730198        0.1873358
##                     runtime vote_average   metascore oscar_nominations
## estimated_budget  0.2213019  -0.01781197 -0.01926886       -0.01070569
## popularity        0.1711806   0.31442032  0.22288853        0.15114230
## total_revenue     0.2207753   0.21688776  0.19085289        0.18687550
## domestic_revenue  0.2063764   0.25109614  0.23626787        0.21768044
## runtime           1.0000000   0.43822659  0.33638179        0.46422390
## vote_average      0.4382266   1.00000000  0.72253190        0.45833258
## metascore         0.3363818   0.72253190  1.00000000        0.51129869
## oscar_nominations 0.4642239   0.45833258  0.51129869        1.00000000
## oscar_wins        0.3728642   0.33373347  0.36538182        0.80591048
##                    oscar_wins
## estimated_budget  -0.02511617
## popularity         0.11787549
## total_revenue      0.17301980
## domestic_revenue   0.18733580
## runtime            0.37286417
## vote_average       0.33373347
## metascore          0.36538182
## oscar_nominations  0.80591048
## oscar_wins         1.00000000

exploratory data analysis

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(trelliscopejs)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.4     ✔ tidyr     1.3.1
## ✔ readr     2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggrepel)
library(treemap)

dive into distributions of some numeric variables

# Min/quartiles/mean/max for total revenue, estimated budget and release year.
summary(movies[["total_revenue"]])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 2.523e+06 4.354e+07 9.088e+07 1.613e+08 1.937e+08 2.924e+09
summary(movies[["estimated_budget"]])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##     15000  18000000  35000000  49790011  66000000 380000000
summary(movies[["release_year"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1937    1999    2005    2003    2011    2016

minimum and maximum revenues

# Find the row(s) with the highest total revenue and show revenue and title.
# The looked-up indices are stored and reused instead of retyping the printed
# row number, so the lookup stays correct if the data is re-sorted or refreshed.
max_revenue_rows <- which(movies$total_revenue == max(movies$total_revenue))
max_revenue_rows
## [1] 1
movies[max_revenue_rows, c("total_revenue", "title")]
# Same lookup for the lowest total revenue.
min_revenue_rows <- which(movies$total_revenue == min(movies$total_revenue))
min_revenue_rows
## [1] 1560
movies[min_revenue_rows, c("total_revenue", "title")]

success of most common directors and actors

# pull out the (up to) 25 most frequent directors and actor1 values
# head() is applied to the sorted names so that fewer than 25 distinct values
# yields a shorter vector; `[1:25]` would pad the result with NA names, and an
# NA entry would then match missing values in later `%in%` filters.
directors <- table(movies$director)
top25_directors <- head(names(sort(directors, decreasing = TRUE)), 25)
actors <- table(movies$actor1)
top25_actors <- head(names(sort(actors, decreasing = TRUE)), 25)

# Bubble chart for the directors listed in top25_directors: movie count on x,
# mean metascore on y, bubble size = mean total revenue in millions.
movies %>%
  filter(director %in% top25_directors) %>%
  group_by(director) %>%
  summarise(
    movie_count = n(),
    avg_score = mean(metascore),
    avg_revenue_millions = mean(total_revenue) / 1e6
  ) %>%
  ggplot(aes(x = movie_count, y = avg_score, size = avg_revenue_millions)) +
  # shape 21 is a filled circle: outline from `colour`, interior from `fill`
  geom_point(colour = "gray", fill = "lightyellow1", shape = 21) +
  geom_text_repel(
    aes(label = director),
    size = 2,
    vjust = 0,
    max.overlaps = 25
  ) +
  xlim(5, 30) +
  ylim(35, 85) +
  labs(
    title = "Top 25 Most Common Directors and their Success",
    x = "Movie Count",
    y = "Average Metascore",
    size = "Average Movie\nRevenue (millions)"
  ) +
  theme(plot.title = element_text(hjust = 0.5))

# Bubble chart for the actor1 values listed in top25_actors: movie count on x,
# mean metascore on y, bubble size = mean total revenue in millions.
movies %>%
  filter(actor1 %in% top25_actors) %>%
  group_by(actor1) %>%
  summarise(
    movie_count = n(),
    avg_score = mean(metascore),
    avg_revenue_millions = mean(total_revenue) / 1e6
  ) %>%
  ggplot(aes(x = movie_count, y = avg_score, size = avg_revenue_millions)) +
  # shape 21 is a filled circle: outline from `colour`, interior from `fill`
  geom_point(colour = "gray", fill = "lightyellow1", shape = 21) +
  geom_text_repel(
    aes(label = actor1),
    size = 2,
    vjust = 0,
    max.overlaps = 25
  ) +
  xlim(10, 30) +
  ylim(40, 70) +
  labs(
    title = "Top 25 Most Common Actors and their Success",
    x = "Movie Count",
    y = "Average Metascore",
    size = "Average Movie\nRevenue (millions)"
  ) +
  theme(plot.title = element_text(hjust = 0.5))

rank these actors and directors by metascore

# Mean metascore per actor1 value in top25_actors, highest first.
actors_ranked <- movies %>%
  filter(is.element(actor1, top25_actors)) %>%
  group_by(actor1) %>%
  summarise(avg_score = mean(metascore)) %>%
  arrange(-avg_score)
print(actors_ranked)
# Mean metascore per director in top25_directors, highest first.
directors_ranked <- movies %>%
  filter(is.element(director, top25_directors)) %>%
  group_by(director) %>%
  summarise(avg_score = mean(metascore)) %>%
  arrange(-avg_score)
print(directors_ranked)

rank them by average revenue

# Mean total revenue per actor1 value in top25_actors, highest first.
# Note: this reassigns actors_ranked / directors_ranked.
actors_ranked <- movies %>%
  filter(is.element(actor1, top25_actors)) %>%
  group_by(actor1) %>%
  summarise(avg_revenue = mean(total_revenue)) %>%
  arrange(-avg_revenue)
print(actors_ranked)
# Mean total revenue per director in top25_directors, highest first.
directors_ranked <- movies %>%
  filter(is.element(director, top25_directors)) %>%
  group_by(director) %>%
  summarise(avg_revenue = mean(total_revenue)) %>%
  arrange(-avg_revenue)
print(directors_ranked)

look at standout directors/actors

# Print every movie for a few standout directors and actor1 values.
# `%in%` is used instead of `==`: `==` yields NA when the name is missing, and
# an NA in a logical row index adds a spurious all-NA row to the result.
movies[movies$director %in% "Martin Scorsese", ]
movies[movies$director %in% "Steven Spielberg", ]
movies[movies$actor1 %in% "Leonardo DiCaprio", ]
movies[movies$actor1 %in% "Robert De Niro", ]
movies[movies$actor1 %in% "Will Smith", ]

top 10 most profitable movies across the world

# Scatter plot of the ten movies with the highest total revenue: estimated
# budget vs. total revenue (both in millions), labelled by title.
# NOTE(review): rows are ranked by total_revenue, not by the `profit` column,
# although the plot title says "Profitable" -- confirm which was intended.
movies %>%
  # slice_max() replaces the superseded top_n(); it already returns rows in
  # descending order, so the separate arrange() step is no longer needed.
  slice_max(total_revenue, n = 10) %>%
  ggplot(aes(x = estimated_budget/1000000, y = total_revenue/1000000)) + 
    # points falling outside these axis limits are removed from the plot
    ylim(c(1000,3000)) + 
    xlim(c(50,300)) + 
    geom_point(col = "royalblue") +
    geom_smooth(col = "cadetblue3", se = FALSE) + 
    geom_text(aes(label = title), vjust = 1.5, size = 2.5) +
    labs(x = "Estimated Budget (millions)", y = "Revenue (millions)", title = "Top 10 Profitable Movies") +
    theme(plot.title = element_text(hjust = 0.5))

top 10 most profitable movies in the US

# Scatter plot of the ten movies with the highest domestic revenue: estimated
# budget vs. domestic revenue (both in millions), labelled by title.
# NOTE(review): rows are ranked by domestic_revenue, not by profit, although
# the plot title says "Profitable" -- confirm which was intended.
movies %>%
  # slice_max() replaces the superseded top_n(); it already returns rows in
  # descending order, so the separate arrange() step is no longer needed.
  slice_max(domestic_revenue, n = 10) %>%
  ggplot(aes(x = estimated_budget/1000000, y = domestic_revenue/1000000)) + 
  # points falling outside these axis limits are removed from the plot
  ylim(c(300,800)) + 
  xlim(c(0,300)) + 
  geom_point(col = "mediumvioletred") +
  geom_smooth(col = "plum1", se = FALSE) + 
  geom_text(aes(label = title), vjust = 1.5, size = 2) +
  labs(x = "Estimated Budget (millions)", y = "Domestic Revenue (millions)", title = "Top 10 Profitable Movies Domestically") +
  theme(plot.title = element_text(hjust = 0.5))

distributions

# release years: one bar per year, height = number of movies released
movies %>%
  ggplot(aes(x = release_year)) +
  geom_bar(colour = "gray", fill = "darkseagreen1") +
  labs(
    title = "Histogram of Movie Releases",
    x = "Release Year",
    y = "Movie Count"
  )

# pull out the (up to) 10 most frequent production companies
# head() on the sorted names avoids the NA padding that `[1:10]` produces when
# there are fewer than 10 distinct companies; an NA entry would make the
# `%in%` filter below also keep rows with a missing company.
prod <- table(movies$production_company_main)
top10 <- head(names(sort(prod, decreasing = TRUE)), 10)

# production companies: movie count for each company kept in `top10`
movies %>%
  filter(production_company_main %in% top10) %>%
  ggplot(aes(production_company_main)) +
    geom_bar(fill = "darkseagreen1", col = "gray") +
    labs(x = "Top 10 Production Companies", y = "Movie Count", title = "Histogram of Production Companies") +
    theme(axis.text.x = element_text(angle = 15, hjust = 1, size = 5))

# create new dummy variables for genres
# Each entry maps a new 0/1 column name to the pattern searched for in the
# `genres` column.
genre_patterns <- c(
  action = "Action",
  romance = "Romance",
  drama = "Drama",
  comedy = "Comedy",
  adventure = "Adventure",
  fantasy = "Fantasy",
  scifi = "Science Fiction",
  thriller = "Thriller",
  horror = "Horror",
  crime = "Crime",
  family = "Family",
  animation = "Animation",
  mystery = "Mystery",
  history = "History",
  war = "War",
  western = "Western",
  music = "Music",
  doc = "Documentary"
)
# Columns are appended to `movies` in the order listed above; a row gets 1 when
# its `genres` value matches the pattern and 0 otherwise.
movies[names(genre_patterns)] <- lapply(
  genre_patterns,
  function(pattern) ifelse(grepl(pattern, movies$genres), 1, 0)
)

# genres: number of movies tagged with each genre
# The genre indicator columns are selected by name (`action:doc`) instead of by
# position (25:42), so the reshape cannot silently pick up the wrong columns
# when columns are added or removed before the indicators.
movies %>% 
  pivot_longer(cols = action:doc, names_to = "genre", values_to = "present") %>%
  # keep only the (movie, genre) pairs where the indicator is set
  filter(present == 1) %>%
  ggplot(aes(x = genre)) +
    geom_bar(fill = "darkseagreen1", col = "gray") +
    labs(x = "Genres", y = "Movie Count", title = "Histogram of Movie Genres") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 7))

oscars alignment with revenues

# create new dummy variable for oscar wins
# Label each movie by whether oscar_wins is at least 2.
movies[["oscar_dummy"]] <- ifelse(
  movies[["oscar_wins"]] >= 2,
  "Won at least 2 Oscars",
  "Won less than 2 Oscars"
)

# Total revenue (in millions) by Oscar label, for movies below 1 billion.
movies %>%
  filter(total_revenue < 1e9) %>%
  ggplot(aes(x = oscar_dummy, y = total_revenue / 1e6)) +
  # one fill colour per box, in axis order
  geom_boxplot(fill = c("thistle1", "orchid")) +
  labs(
    y = "Revenue (millions)",
    title = "Movies by Oscar/Revenue Success"
  ) +
  theme(axis.title.x = element_blank())

oscars alignment with metascores

# Metascore distribution by Oscar label (uses the oscar_dummy column).
ggplot(movies, aes(x = oscar_dummy, y = metascore)) +
  # one fill colour per box, in axis order
  geom_boxplot(fill = c("paleturquoise", "turquoise4")) +
  labs(
    y = "Metascore",
    title = "Movies by Oscar/Critic Success"
  ) +
  theme(axis.title.x = element_blank())

success of different production companies

# pull out the (up to) 10 most frequent production companies
# head() on the sorted names avoids the NA padding that `[1:10]` produces when
# there are fewer than 10 distinct companies; an NA entry would make the
# `%in%` filter below also keep rows with a missing company.
prod_companies <- table(movies$production_company_main)
top10_companies <- head(names(sort(prod_companies, decreasing = TRUE)), 10)

# Bubble chart per company: movie count on x, mean metascore on y, bubble size
# = summed total revenue in billions.
movies %>%
  filter(production_company_main %in% top10_companies) %>%
  group_by(production_company_main) %>%
  summarise(movie_count = n(),
            avg_score = mean(metascore),
            total_revenue_generated = sum(total_revenue) / 1000000000) %>%
  ggplot(aes(x = movie_count, y = avg_score, size = total_revenue_generated)) + 
    geom_point(col = "gray", bg = "salmon", pch = 21) +
    geom_text_repel(aes(label = production_company_main), size = 2, vjust = 0, max.overlaps = 25) +
    labs(x = "Movie Count", 
         y = "Average Metascore", 
         title = "Top 10 Most Common Production Companies and their Success",
         size = "Total Movie\nRevenue (Billions)") +
    theme(plot.title = element_text(hjust = 0.5))

success of different genres

# Total revenue (in millions) by genre, for movies at or below 1 billion.
# The genre indicator columns are selected by name (`action:doc`) instead of by
# position (25:42), so the reshape cannot silently pick up the wrong columns
# when columns are added or removed before the indicators.
movies %>%
  filter(total_revenue <= 1000000000) %>%
  pivot_longer(cols = action:doc, names_to = "genre", values_to = "present") %>%
  # keep only the (movie, genre) pairs where the indicator is set
  filter(present == 1) %>%
  ggplot(aes(x = genre, y = total_revenue/1000000)) +
    # rainbow(18) supplies one fill per box, so all 18 genres must still be
    # present after the filters above
    geom_boxplot(fill = rainbow(18)) +
    labs(title = "Movies by Genre vs Revenue",
         x = "Genre",
         y = "Revenue (millions)") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 7))

# Metascore distribution by genre.
# The genre indicator columns are selected by name (`action:doc`) instead of by
# position (25:42), so the reshape cannot silently pick up the wrong columns
# when columns are added or removed before the indicators.
movies %>%
  pivot_longer(cols = action:doc, names_to = "genre", values_to = "present") %>%
  # keep only the (movie, genre) pairs where the indicator is set
  filter(present == 1) %>%
  ggplot(aes(x = genre, y = metascore)) +
    # rainbow(18) supplies one fill per box, so all 18 genres must be present
    geom_boxplot(fill = rainbow(18)) +
    labs(title = "Movies by Genre vs Metascore",
         x = "Genre",
         y = "Metascore") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 7))

seasonal movie success

# Heatmap of mean total revenue (in millions) per release year and month,
# restricted to movies at or below 1 billion in total revenue.
movies %>%
  filter(total_revenue <= 1e9) %>%
  # turn the numeric month into an ordered January..December factor
  mutate(release_month = factor(release_month, levels = 1:12, labels = month.name)) %>%
  group_by(release_year, release_month) %>%
  summarise(
    avg_revenue = mean(total_revenue, na.rm = TRUE) / 1e6,
    .groups = "drop"
  ) %>%
  ggplot(aes(x = release_year, y = release_month, fill = avg_revenue)) +
  geom_tile(colour = "white") +
  scale_fill_gradient(low = "lightyellow", high = "slateblue") +
  labs(
    title = "Average Movie Revenue by Month and Year",
    x = "Release Year",
    y = "Month",
    fill = "Average Revenue\n(millions)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8)
  )

# Mean total revenue per release year (in millions), drawn as points joined by
# a dashed line.
movies %>%
  group_by(release_year) %>%
  summarise(avg_revenue = mean(total_revenue)) %>%
  ggplot(aes(x = release_year, y = avg_revenue / 1e6)) +
  # shape 21 is a filled circle: outline from `colour`, interior from `fill`
  geom_point(colour = "dodgerblue", fill = "gray", shape = 21) +
  geom_line(colour = "royalblue", linetype = 2) +
  labs(
    title = "Movie Revenue Over Time",
    x = "Release Year",
    y = "Average Movie Revenue (millions)"
  )

franchise/sequel analysis

# create new dummy variables for movie franchises/sequels
# Flags matched against the production company name.
company_patterns <- c(
  marvel = "Marvel",
  dc = "DC Comics"
)
movies[names(company_patterns)] <- lapply(
  company_patterns,
  function(pattern) ifelse(grepl(pattern, movies$production_company_main), 1, 0)
)
# Flags matched against the movie title; appended after the company flags.
title_patterns <- c(
  harry_potter = "Harry Potter",
  hunger_games = "Hunger Games",
  lord_of_the_rings = "The Lord of",
  toy_story = "Toy Story",
  indiana_jones = "Indiana Jones"
)
movies[names(title_patterns)] <- lapply(
  title_patterns,
  function(pattern) ifelse(grepl(pattern, movies$title), 1, 0)
)

# Treemap of franchises: tile area = summed total revenue, tile colour =
# metascore.
# The franchise indicator columns are selected by name instead of by position
# (44:50), so the reshape does not depend on how many columns precede them.
movies %>% 
  pivot_longer(
    cols = marvel:indiana_jones,
    names_to = "franchise_sequel",
    values_to = "present"
  ) %>%
  # keep only the (movie, franchise) pairs where the indicator is set
  filter(present == 1) %>%
  treemap(
    index = "franchise_sequel",
    vSize = "total_revenue",
    vColor = "metascore",
    type = "value", 
    # treemap() aggregates vColor with sum by default, which makes the colour
    # grow with the number of films in a franchise; use the mean metascore.
    fun.aggregate = "mean",
    palette = "Reds",
    title = "Franchise/Sequel Movie Success",
    fontsize.labels = 12,
    border.col = "white"
  )

# Plot total revenue (in millions) against release year for the movies flagged
# by one franchise indicator column, with a fitted straight line and each point
# labelled by title. Replaces four copy-pasted pipelines that differed only in
# the indicator column and the plot title.
#   flag_col:   name of a 0/1 indicator column in `data` (character scalar)
#   plot_title: text used as the plot title
#   data:       data frame to plot; defaults to `movies`
# Returns the ggplot object, which is auto-printed when called at top level.
plot_franchise_revenue <- function(flag_col, plot_title, data = movies) {
  data %>%
    # .data[[...]] looks the column up by its string name
    filter(.data[[flag_col]] == 1) %>%
    ggplot(aes(x = release_year, y = total_revenue/1000000)) + 
      geom_point(col = "gold") +
      geom_smooth(method = "lm", col = "tomato", se = FALSE) + 
      geom_text(aes(label = title), vjust = 1.5, size = 2.5) +
      labs(x = "Release Year", y = "Revenue (millions)", title = plot_title) +
      theme(plot.title = element_text(hjust = 0.5))
}

plot_franchise_revenue("marvel", "Marvel Movies")

plot_franchise_revenue("dc", "DC Movies")

plot_franchise_revenue("harry_potter", "Harry Potter Movies")

plot_franchise_revenue("hunger_games", "Hunger Games Movies")

create a variable for the ratio of revenue to budget, e.g. “total box office revenue is 10 times production budget”

# Ratio of total revenue to estimated budget for every movie.
movies$revenue_to_budget <- movies$total_revenue / movies$estimated_budget
which(movies$revenue_to_budget > 100)
##  [1] 1294 1652 1658 1659 1660 1662 1663 2226 2232 2234 2236
# leave out these movies as they are outliers

# look at the max revenue : budget
# The row index is looked up once and reused instead of being retyped from the
# printed output, so it stays correct if the data changes.
max_ratio_rows <- which(movies$revenue_to_budget == max(movies$revenue_to_budget))
max_ratio_rows
## [1] 1663
movies[max_ratio_rows, ]
# Paranormal Activity made more than 12000 times its budget

# Metascore vs. vote average, point size = revenue-to-budget ratio, for movies
# whose ratio is below 100.
# The column is referenced directly: `movies$revenue_to_budget` inside filter()
# bypasses the piped data and is only correct while both have identical rows.
movies %>%
  filter(revenue_to_budget < 100) %>%
  ggplot(aes(x = metascore, y = vote_average, size = revenue_to_budget)) +
  geom_point(col = "black", bg = "magenta", pch = 21)

export data to new csv

# Write the current `movies` data frame to a new CSV file, omitting row names.
write.csv(
  x = movies,
  file = "movie_data_final.csv",
  row.names = FALSE
)