library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.6     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.9
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   2.1.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(HelpR)
library(jsonlite)
## 
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
## 
##     flatten
library(tidyr)
library(dplyr)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
library(stats)
library(ggfortify)
library(ggplot2)
library(ggrepel)
library(statsr)
## Loading required package: BayesFactor
## Loading required package: coda
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## ************
## Welcome to BayesFactor 0.9.12-4.3. If you have questions, please contact Richard Morey (richarddmorey@gmail.com).
## 
## Type BFManual() to open the manual.
## ************
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(broom)
library(rmarkdown)
options(ggrepel.max.overlaps = Inf)

Introduction

Movie data sets can contain a lot of information, from their genre, popularity and vote count. These data sets also include studio of production for each movie, and contain information regarding their original budget and gross revenue. Using just a select few Data Science methods we can begin to scratch the surface about what determines a good movie based on just a few of these metrics.

Loading in the data and renaming it for ease

movies <- read.csv('/Volumes/GoogleDrive-116895307585193883012/.shortcut-targets-by-id/0BwEyzS8OvJgreXdPNGZKV2tyRjg/Share_Clients/Data Science Project Work/Sandbox/Movies/data/tmdb_5000_movies.csv')

Extracting any Json data and reassigning them as new data frame object

movies_extracted <- movies%>%
  filter(nchar(genres)>2) %>%
  mutate(gen = lapply(genres, fromJSON)) %>% 
  filter(nchar(keywords)>2)%>%
  mutate(keyw = lapply(keywords, fromJSON)) %>% 
  filter(nchar(production_companies)>2) %>%
  mutate(prod_comp = lapply(production_companies, fromJSON)) %>% 
  filter(nchar(production_countries)>2) %>%
  mutate(prod_count = lapply(production_countries, fromJSON)) %>% 
  filter(nchar(spoken_languages)>2) %>%
  mutate(languages = lapply(spoken_languages, fromJSON))

genres <- movies %>% 
  filter(nchar(genres)>2) %>%                 
  mutate(gen = lapply(genres, fromJSON)) %>% 
  unnest(gen, .name_repair = "unique") %>%    
  select(id, title, genres=name) %>% 
  mutate_if(is.character, factor)

keywords <- movies %>% 
  filter(nchar(keywords)>2) %>%                 
  mutate(keyw = lapply(keywords, fromJSON)) %>% 
  unnest(keyw, .name_repair = "unique") %>%    
  select(id, title, keywords=name)

production_companies <- movies %>% 
  filter(nchar(production_companies)>2) %>%                 
  mutate(prod_comp = lapply(production_companies, fromJSON)) %>% 
  unnest(prod_comp, .name_repair = "unique") %>%    
  select(id, title, name=name)

production_countries <- movies %>% 
  filter(nchar(production_countries)>2) %>%                 
  mutate(prod_count = lapply(production_countries, fromJSON)) %>% 
  unnest(prod_count, .name_repair = "unique") %>%    
  select(iso_3166_1, title, name=name)

Plucked out the 3 genres for each film and binds them onto original data frame

genres3 <- genres
genres3$order <- 0
genres3$order[1] <- 1


for(i in 1:(nrow(genres3)-1)) {
  if(genres3$id[i+1]!=genres3$id[i]){
    genres3$order[i+1] <- 1
  } else {genres3$order[i+1] <- (genres3$order[i])+1}
}

genres3 <- genres3 %>% filter(order < 4) %>%
  spread(key=order, value=genres) %>%
  rename(genre_1="1", genre_2="2", genre_3="3")

Plucked out the 3 keywords for each film and binds them onto original data frame

keywords3 <- keywords
keywords3$order <- 0
keywords3$order[1] <- 1


for(i in 1:(nrow(keywords3)-1)) {
  if(keywords3$id[i+1]!=keywords3$id[i]){
    keywords3$order[i+1] <- 1
  } else {keywords3$order[i+1] <- (keywords3$order[i])+1}
}

keywords3 <- keywords3 %>% filter(order < 6) %>%
  spread(key=order, value=keywords) %>%
  rename(keyword_1="1", keyword_2="2", keyword_3="3", keyword_4="4", keyword_5="5")

Plucked out the production companies and binding back to original data frame

companies3 <- production_companies
companies3$order <- 0
companies3$order[1] <- 1


for(i in 1:(nrow(companies3)-1)) {
  if(companies3$id[i+1]!=companies3$id[i]){
    companies3$order[i+1] <- 1
  } else {companies3$order[i+1] <- (companies3$order[i])+1}
}

companies3 <- companies3 %>% filter(order < 4) %>%
  spread(key=order, value=name) %>%
  rename(company_1="1", company_2="2", company_3="3")

Binding all separate objects to create one df with all variables within

movies3 <- left_join(movies, keywords3 %>% 
                       select(id, keyword_1, keyword_2, keyword_3, keyword_4, keyword_5), by="id")


movies4 <- left_join(movies3, companies3 %>% 
                       select(id, company_1, company_2, company_3), by="id")

movies5 <- left_join(movies4, genres3 %>% 
                       select(id, genre_1, genre_2, genre_3),by="id")

Experimental Plots.

We plotted some experimental plots to point out any variables of interest.

Once the data had been appropriately processed into a format which meant plotting the variables such as genres, production companies and keywords were easy to interpret. We then took upon experimental methods by plotting variables of interest such as movie budgets and revenue by genre.

Looking at budget and revenue by genre

# Remove genres which don't tell a story in viz
filtered_movies2 <- movies5 %>% 
  filter(genre_1 != "NA") %>% 
  filter(genre_1 != "Foreign") %>% 
  filter(genre_1 != "TV Movie") 

# Removes any data which didn't tell a story outside of the given dates
filtered_movies2 <- filtered_movies2 %>% 
  filter(release_date >= as.Date("1964-12-25") & release_date <= as.Date("2017-02-03"))

# Facet viz with labels scaled with breaks and dollars to display information clearly
budget_revenue <- ggplot(data = filtered_movies2) + 
  geom_point(mapping = aes(x = budget, y = revenue, colour = genre_1)) + 
  facet_wrap(~ genre_1, nrow = 2) +
  labs(title = "Budget and Revenue by Genre", x = "Movie Budget", y = "Gross Revenue") +
  scale_y_continuous("Gross Revenue", 
                       breaks = scales::breaks_extended(8),
                       labels = scales::label_dollar()) + 
  scale_x_continuous("Movie Budget",
                     breaks = scales::breaks_extended(3),
                     labels = scales::label_dollar()) 
  
  
  budget_revenue +
    scale_colour_viridis_d() +
    theme_minimal() +
    theme(legend.position = "none")

From this plot alone we can see that genres Action and Adventure (with the exception of some science fiction films) excel in terms of revenue, such movies require an initial upfront budget as seen by the plot.

From this, we now have an idea of what variables can be useful for telling a story, whether it be genre success or other contributing factors such as production company, but it is possible to gain insight once a general direction of interest has been set.

Another variable which is useful for analysis, release date and the potential this had with gaining further insight mapping variables over time. For example, we looked at the change in budgets overtime, along with revenue.

Movie Budgets over time

ggplot(data = filtered_movies2, aes(x = release_date, 
                                    y = budget,
                                    colour = genre_1)) +
  geom_point() +
  theme_minimal() +
  scale_colour_viridis_d(option = "plasma") +
  labs(x = "Release Date",
    y = "Movie Budgets (Dollars)",
    title = "Movie Budgets Over Time",
    colour = "Genres") +
   scale_y_continuous("Movie Budget", 
                       breaks = scales::breaks_extended(8),
                       labels = scales::label_dollar()) 

Gross Revenue over time

ggplot(data = filtered_movies2, aes(x = release_date,
                                    y = revenue,
                                    colour = genre_1)) +
  geom_point() +
  theme_minimal() +
  scale_colour_viridis_d() +
  labs(x = "Release Date",
    y = "Gross Revenue (Dollars)",
    title = "Revenue Over Time",
    colour = "Genres") +
   scale_y_continuous("Gross Revenue (Dollars)", 
                       breaks = scales::breaks_extended(8),
                       labels = scales::label_dollar())

We had originally looked at the entire data set, mapping popularity over time but the visualization was poor and included lots of data that didn’t necessarily tell a story. So we filtered it down to exclude any movies released before 1965.

Scatter plot of movie popularity over time, by genre

ggplot(data = filtered_movies2, aes(x = release_date, y = popularity, colour = genre_1)) +
  geom_point() +
  theme_minimal() +
  scale_colour_viridis_d() +
  labs(x = "Release Date",
    y = "Popularity",
    title = "Popularity Over Time",
    colour = "Genres") 

Now looking at the same data facetted, it’s clear to see the change in popularity of each genre over time, much clearer than the original scatter plot.

Thinking analytically, it’s clear now that movies post 2010 are becoming increasingly popular across all genres, possibly due to newer technologies made possible with the increasingly large budgets movies now have in recent years. However, you can imagine any modern day actor/actress will argue differently…

But there’s more, looking at this plot one can see spike in popularity within the 1970’s and 1980’s across genres; Action, Adventure, Horror and Sci Fi. And when we further look into this era and the feature films released, it’s not hard to believe what we see. (Top Gun, The Goonies, A Nightmare on Elm Street, The Terminator and of course, Back to the Future). The same can also be seen in the period of 1965-1975 with Western films, most likely a direct influence from Clint Eastwood and the iconic Western movies released during this time.

Movie popularity over time, by genre

genre_popularity <- ggplot(data = filtered_movies2) + 
  geom_point(mapping = aes(x = release_date, y = popularity, colour = genre_1)) + 
  facet_wrap(~ genre_1, nrow = 2) +
  labs(title = "Popularity of Genres over time", x = "Release Date", y = "Movie Popularity") +
  theme_minimal() +
  theme(legend.position = "none") 

genre_popularity +
  scale_color_viridis_d()

Scatter plot of popularity and revenue by genre

revpop_bygenre <- ggplot(data = filtered_movies2, aes(x = revenue, y = popularity, colour = genre_1)) +
  geom_point() +
  labs(x = "Movie Revenue",
    y = "Popularity",
    title = "Revenue and Popularity by Genre",
    colour = "Genres") +
  scale_x_continuous("Movie Revenue", 
                       breaks = scales::breaks_extended(10),
                       labels = scales::label_dollar())

revpop_bygenre + 
  theme_minimal() + 
  scale_colour_viridis_d()

Facetted plot of revenue and popularity by genre

genre_success <- ggplot(data = filtered_movies2) + 
  geom_point(mapping = aes(x = revenue, y = popularity, colour = genre_1)) + 
  facet_wrap(~ genre_1, nrow = 2) +
  labs(title = "Success of Genres mapped by revenue and popularity", x = "Movie Revenue", y = "Movie Popularity") +
  theme_minimal() +
  theme(legend.position = "none") +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5)) + 
  scale_x_continuous("Movie Revenue", 
                       breaks = scales::breaks_extended(10),
                       labels = scales::label_dollar())

genre_success +
  scale_colour_viridis_d() 

Now that we have looked at genre success over the years, how can we further analyse the same set of data and tell a more descriptive story?

Can we identify what companies perform best overall?

With some more Data processing/wrangling we can create an object with the mean revenue, mean popularity and the n/movies released by each of the production companies.

Shows all companies in descending order in terms of movies released

companies_desc <- filtered_movies2 %>% 
  count(company_1) %>% 
  arrange(desc(n)) %>% 
  mutate(total_movies = factor(n)) %>% 
  filter(company_1 != "NA")

Creates an object with the revenue summary of companies

company_revenue <- filtered_movies2 %>% 
  group_by(company_1) %>% 
  summarise(mean_revenue = mean(revenue)) %>% 
  filter(company_1 != "NA")

Creates an object with popularity summary of companies

company_popularity <- filtered_movies2 %>% 
  group_by(company_1) %>% 
  summarise(mean_popularity = mean(popularity)) %>% 
  filter(company_1 != "NA")

Shows the most common genres throughout

# Created an object with genres in descending in terms of volume  
genres_desc <- filtered_movies2 %>% 
  count(genre_1) %>% 
  arrange(desc(n)) %>% 
  mutate(genres = factor(n)) %>% 
  filter(genre_1 != "NA")

Joined the objects created about company performance by; volume, revenue and popularity, Sampling top 15 companies descending by volume

joined_revpop <- left_join(companies_desc, company_revenue, by = "company_1")
joined_revpop <- left_join(joined_revpop, company_popularity, by = "company_1")

companies_top15 <- joined_revpop %>% 
  dplyr::slice_head(n = 15)

companies_top25 <- joined_revpop %>% 
  dplyr::slice_head(n = 25)

With the data wrangled and a new object created, we can now select the top 25 or top 15 production companies based on volume of released movies to analyse these and visualize any variables which determine company success.

With a linear model displayed on the plot, we can start to see which companies have high means of revenue a d popularity mapping their success by way of regression. Here we can understand that companies such as Walt Disney and Summit entertainment score exceptionally high in popularity across their released movies and receive a high mean revenue in return. While companies such as Metro-Goldwyn-Mayer and Touchstone seem to be struggling in comparison.

Scatter plot with a smooth linear model overlay

smooth_scatter <- ggplot(data = companies_top25, 
                         mapping = aes(x = mean_revenue, 
                                       y = mean_popularity,
                                       label = company_1)) +
  geom_point(size = 4,
             aes(colour = company_1)) + 
  geom_label_repel(label.size = 0.25, label.r = 0.05) +
  geom_smooth() +
  theme_minimal() +
  labs(x = "Mean Average Revenue",
    y = "Mean Movie Popularity",
    title = "Company Success by Mean Revenue and Popularity",
    colour = "Production Companies") +
  scale_x_continuous("Mean Revenue",
                     breaks = scales::breaks_extended(10),
                     labels = scales::label_dollar()) +
  theme(legend.position = "none")

smooth_scatter +
  scale_colour_viridis_d() 
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Now we wanted to use a more computerized approach to exploring the same content. K-means clustering is a method of clustering n/movies into k clusters by way of assigning movies to the nearest cluster in mean values. It is an unsupervised machine learning technique which aims to partition observations accurately within a data set.

K-Means Cluster Analysis

Creates a data frame with only the numeric variables within

reduced_data_numeric <- select(filtered_movies2, c(id, budget, popularity, revenue, runtime, vote_average, vote_count))

tmp_ids <- reduced_data_numeric %>%
  pull(id)

tmp_ids_removed <- reduced_data_numeric %>% na.omit() %>% tibble() %>%
  pull(id)

ids_to_filter <- setdiff(tmp_ids, tmp_ids_removed)

reduced_data_numeric <- reduced_data_numeric %>%
  filter(!id %in% ids_to_filter) 

filtered_movies2 <- filtered_movies2 %>% 
  filter(id %in% reduced_data_numeric$id)

Once created, it were possible to plot the k means cluster and observe total of n/movies in each cluster and their relative mean values such as; average popularity, run time etc.

K means with k equal to 5 (5 clusters)

KM <- kmeans(na.omit(reduced_data_numeric), 5)

Cluster plot visualization

cluster_plot <- autoplot(KM, na.omit(reduced_data_numeric), frame=TRUE) 

cluster_plot + 
  scale_color_viridis_d() + 
  theme_minimal()

These mean values can be seen below by way of plotting a table using package kable.

From this we can see what each cluster is in respects to this data. The least ‘successful’ cluster being the most populated and with the lowest average scores for all of the metrics other than size. Seemingly the most ‘successful’ of the clusters, has the longest mean run time, with high mean scores across all metrics except size.

More observations can be made by looking at this table. Noticeably, when focusing on ‘success’ of clusters (defined by mean popularity and revenue) it’s possible to see a correlation between cluster ‘success’ and run time. With the more successful clusters having a higher average run time, and this average value begins to descend as the clusters become less ‘successful’ across other metrics, scoring lower on all other metrics. Again, we can now start to tell a story about the likelihood of a movie’s ‘success’ based upon metrics displayed here.

Kable table with information in KM statistical object

tibble_KM <- tidy(KM)

tibble_KM <- relocate(tibble_KM, cluster,.before = budget, after = NULL)

kbl(tibble_KM) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
id cluster budget popularity revenue runtime vote_average vote_count size withinss
42782.95 1 90377950 53.97796 336236357 116.1056 6.479503 2272.9783 322 2.632697e+18
71517.77 2 208269231 158.30244 1228958349 144.2308 7.065385 6586.1923 26 3.847668e+18
44898.10 3 48849253 32.92726 132340955 112.4378 6.328587 1057.4676 941 2.888917e+18
61237.58 4 13243066 11.58566 14428973 103.5103 5.954191 261.2515 3257 2.340302e+18
58066.78 5 133229741 97.50760 695507260 121.5862 6.877586 4566.8966 116 2.221446e+18

Now we have created a k-means cluster for movies based on their commercial success.

While all of this information is rather interesting, it’s unclear as to what movies exactly fit inside of each of the five clusters. So with a little bit of wrangling using code in R we can assign each of the movies within the data set to their respective cluster.

Assigning cluster numbers to all the movies in the original df

movies_by_cluster <- KM$cluster %>%
  tidy() %>%
  rename(cluster = x)

filtered_movies2 <- filtered_movies2 %>% mutate(cluster = movies_by_cluster$cluster)

Once the original data frame now has this information mutated onto it, it is possible to start plotting the frequency of movie count in each cluster by their genre category. Doing so we can determine which genres are most likely to be included within each cluster, further developing an understanding of the commercial success of movies.

Genre data organised into ascending order

theTable <- within(filtered_movies2, 
                   genre_1 <- factor(genre_1, 
                                      levels = names(sort(table(genre_1), decreasing = TRUE))))

Jitter Plot with genre frequency per topic

jitter_plot <- ggplot(theTable, aes(y = cluster, x = genre_1, fill = genre_1)) +
  geom_point(shape = 21, colour = "black", alpha = 0.5, position = "jitter") +
  scale_fill_viridis_d() +
  labs(title = "Genre frequency per cluster", y = "Cluster", x = "Genre") +
  theme_minimal()+
  theme(legend.position = "none")

jitter_plot