library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.6 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.9
## ✓ tidyr 1.2.0 ✓ stringr 1.4.0
## ✓ readr 2.1.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(HelpR)
library(jsonlite)
##
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
##
## flatten
library(tidyr)
library(dplyr)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(stats)
library(ggfortify)
library(ggplot2)
library(ggrepel)
library(statsr)
## Loading required package: BayesFactor
## Loading required package: coda
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## ************
## Welcome to BayesFactor 0.9.12-4.3. If you have questions, please contact Richard Morey (richarddmorey@gmail.com).
##
## Type BFManual() to open the manual.
## ************
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(broom)
library(rmarkdown)
options(ggrepel.max.overlaps = Inf)
Movie data sets can contain a lot of information, from their genre, popularity and vote count. These data sets also include studio of production for each movie, and contain information regarding their original budget and gross revenue. Using just a select few Data Science methods we can begin to scratch the surface about what determines a good movie based on just a few of these metrics.
Loading in the data and renaming it for ease
movies <- read.csv('/Volumes/GoogleDrive-116895307585193883012/.shortcut-targets-by-id/0BwEyzS8OvJgreXdPNGZKV2tyRjg/Share_Clients/Data Science Project Work/Sandbox/Movies/data/tmdb_5000_movies.csv')
Extracting any Json data and reassigning them as new data frame object
movies_extracted <- movies%>%
filter(nchar(genres)>2) %>%
mutate(gen = lapply(genres, fromJSON)) %>%
filter(nchar(keywords)>2)%>%
mutate(keyw = lapply(keywords, fromJSON)) %>%
filter(nchar(production_companies)>2) %>%
mutate(prod_comp = lapply(production_companies, fromJSON)) %>%
filter(nchar(production_countries)>2) %>%
mutate(prod_count = lapply(production_countries, fromJSON)) %>%
filter(nchar(spoken_languages)>2) %>%
mutate(languages = lapply(spoken_languages, fromJSON))
genres <- movies %>%
filter(nchar(genres)>2) %>%
mutate(gen = lapply(genres, fromJSON)) %>%
unnest(gen, .name_repair = "unique") %>%
select(id, title, genres=name) %>%
mutate_if(is.character, factor)
keywords <- movies %>%
filter(nchar(keywords)>2) %>%
mutate(keyw = lapply(keywords, fromJSON)) %>%
unnest(keyw, .name_repair = "unique") %>%
select(id, title, keywords=name)
production_companies <- movies %>%
filter(nchar(production_companies)>2) %>%
mutate(prod_comp = lapply(production_companies, fromJSON)) %>%
unnest(prod_comp, .name_repair = "unique") %>%
select(id, title, name=name)
production_countries <- movies %>%
filter(nchar(production_countries)>2) %>%
mutate(prod_count = lapply(production_countries, fromJSON)) %>%
unnest(prod_count, .name_repair = "unique") %>%
select(iso_3166_1, title, name=name)
Plucked out the 3 genres for each film and binds them onto original data frame
genres3 <- genres
genres3$order <- 0
genres3$order[1] <- 1
for(i in 1:(nrow(genres3)-1)) {
if(genres3$id[i+1]!=genres3$id[i]){
genres3$order[i+1] <- 1
} else {genres3$order[i+1] <- (genres3$order[i])+1}
}
genres3 <- genres3 %>% filter(order < 4) %>%
spread(key=order, value=genres) %>%
rename(genre_1="1", genre_2="2", genre_3="3")
Plucked out the 3 keywords for each film and binds them onto original data frame
keywords3 <- keywords
keywords3$order <- 0
keywords3$order[1] <- 1
for(i in 1:(nrow(keywords3)-1)) {
if(keywords3$id[i+1]!=keywords3$id[i]){
keywords3$order[i+1] <- 1
} else {keywords3$order[i+1] <- (keywords3$order[i])+1}
}
keywords3 <- keywords3 %>% filter(order < 6) %>%
spread(key=order, value=keywords) %>%
rename(keyword_1="1", keyword_2="2", keyword_3="3", keyword_4="4", keyword_5="5")
Plucked out the production companies and binding back to original data frame
companies3 <- production_companies
companies3$order <- 0
companies3$order[1] <- 1
for(i in 1:(nrow(companies3)-1)) {
if(companies3$id[i+1]!=companies3$id[i]){
companies3$order[i+1] <- 1
} else {companies3$order[i+1] <- (companies3$order[i])+1}
}
companies3 <- companies3 %>% filter(order < 4) %>%
spread(key=order, value=name) %>%
rename(company_1="1", company_2="2", company_3="3")
Binding all separate objects to create one df with all variables within
movies3 <- left_join(movies, keywords3 %>%
select(id, keyword_1, keyword_2, keyword_3, keyword_4, keyword_5), by="id")
movies4 <- left_join(movies3, companies3 %>%
select(id, company_1, company_2, company_3), by="id")
movies5 <- left_join(movies4, genres3 %>%
select(id, genre_1, genre_2, genre_3),by="id")
We plotted some experimental plots to point out any variables of interest.
Once the data had been appropriately processed into a format which meant plotting the variables such as genres, production companies and keywords were easy to interpret. We then took upon experimental methods by plotting variables of interest such as movie budgets and revenue by genre.
# Remove genres which don't tell a story in viz
filtered_movies2 <- movies5 %>%
filter(genre_1 != "NA") %>%
filter(genre_1 != "Foreign") %>%
filter(genre_1 != "TV Movie")
# Removes any data which didn't tell a story outside of the given dates
filtered_movies2 <- filtered_movies2 %>%
filter(release_date >= as.Date("1964-12-25") & release_date <= as.Date("2017-02-03"))
# Facet viz with labels scaled with breaks and dollars to display information clearly
budget_revenue <- ggplot(data = filtered_movies2) +
geom_point(mapping = aes(x = budget, y = revenue, colour = genre_1)) +
facet_wrap(~ genre_1, nrow = 2) +
labs(title = "Budget and Revenue by Genre", x = "Movie Budget", y = "Gross Revenue") +
scale_y_continuous("Gross Revenue",
breaks = scales::breaks_extended(8),
labels = scales::label_dollar()) +
scale_x_continuous("Movie Budget",
breaks = scales::breaks_extended(3),
labels = scales::label_dollar())
budget_revenue +
scale_colour_viridis_d() +
theme_minimal() +
theme(legend.position = "none")
From this plot alone we can see that genres Action and Adventure (with the exception of some science fiction films) excel in terms of revenue, such movies require an initial upfront budget as seen by the plot.
From this, we now have an idea of what variables can be useful for telling a story, whether it be genre success or other contributing factors such as production company, but it is possible to gain insight once a general direction of interest has been set.
Another variable which is useful for analysis, release date and the potential this had with gaining further insight mapping variables over time. For example, we looked at the change in budgets overtime, along with revenue.
ggplot(data = filtered_movies2, aes(x = release_date,
y = budget,
colour = genre_1)) +
geom_point() +
theme_minimal() +
scale_colour_viridis_d(option = "plasma") +
labs(x = "Release Date",
y = "Movie Budgets (Dollars)",
title = "Movie Budgets Over Time",
colour = "Genres") +
scale_y_continuous("Movie Budget",
breaks = scales::breaks_extended(8),
labels = scales::label_dollar())
ggplot(data = filtered_movies2, aes(x = release_date,
y = revenue,
colour = genre_1)) +
geom_point() +
theme_minimal() +
scale_colour_viridis_d() +
labs(x = "Release Date",
y = "Gross Revenue (Dollars)",
title = "Revenue Over Time",
colour = "Genres") +
scale_y_continuous("Gross Revenue (Dollars)",
breaks = scales::breaks_extended(8),
labels = scales::label_dollar())
We had originally looked at the entire data set, mapping popularity over time but the visualization was poor and included lots of data that didn’t necessarily tell a story. So we filtered it down to exclude any movies released before 1965.
ggplot(data = filtered_movies2, aes(x = release_date, y = popularity, colour = genre_1)) +
geom_point() +
theme_minimal() +
scale_colour_viridis_d() +
labs(x = "Release Date",
y = "Popularity",
title = "Popularity Over Time",
colour = "Genres")
Now looking at the same data facetted, it’s clear to see the change in popularity of each genre over time, much clearer than the original scatter plot.
Thinking analytically, it’s clear now that movies post 2010 are becoming increasingly popular across all genres, possibly due to newer technologies made possible with the increasingly large budgets movies now have in recent years. However, you can imagine any modern day actor/actress will argue differently…
But there’s more, looking at this plot one can see spike in popularity within the 1970’s and 1980’s across genres; Action, Adventure, Horror and Sci Fi. And when we further look into this era and the feature films released, it’s not hard to believe what we see. (Top Gun, The Goonies, A Nightmare on Elm Street, The Terminator and of course, Back to the Future). The same can also be seen in the period of 1965-1975 with Western films, most likely a direct influence from Clint Eastwood and the iconic Western movies released during this time.
genre_popularity <- ggplot(data = filtered_movies2) +
geom_point(mapping = aes(x = release_date, y = popularity, colour = genre_1)) +
facet_wrap(~ genre_1, nrow = 2) +
labs(title = "Popularity of Genres over time", x = "Release Date", y = "Movie Popularity") +
theme_minimal() +
theme(legend.position = "none")
genre_popularity +
scale_color_viridis_d()
revpop_bygenre <- ggplot(data = filtered_movies2, aes(x = revenue, y = popularity, colour = genre_1)) +
geom_point() +
labs(x = "Movie Revenue",
y = "Popularity",
title = "Revenue and Popularity by Genre",
colour = "Genres") +
scale_x_continuous("Movie Revenue",
breaks = scales::breaks_extended(10),
labels = scales::label_dollar())
revpop_bygenre +
theme_minimal() +
scale_colour_viridis_d()
genre_success <- ggplot(data = filtered_movies2) +
geom_point(mapping = aes(x = revenue, y = popularity, colour = genre_1)) +
facet_wrap(~ genre_1, nrow = 2) +
labs(title = "Success of Genres mapped by revenue and popularity", x = "Movie Revenue", y = "Movie Popularity") +
theme_minimal() +
theme(legend.position = "none") +
theme(axis.text.x = element_text(angle = 45, vjust = 0.5)) +
scale_x_continuous("Movie Revenue",
breaks = scales::breaks_extended(10),
labels = scales::label_dollar())
genre_success +
scale_colour_viridis_d()
Now that we have looked at genre success over the years, how can we further analyse the same set of data and tell a more descriptive story?
Can we identify what companies perform best overall?
With some more Data processing/wrangling we can create an object with the mean revenue, mean popularity and the n/movies released by each of the production companies.
Shows all companies in descending order in terms of movies released
companies_desc <- filtered_movies2 %>%
count(company_1) %>%
arrange(desc(n)) %>%
mutate(total_movies = factor(n)) %>%
filter(company_1 != "NA")
Creates an object with the revenue summary of companies
company_revenue <- filtered_movies2 %>%
group_by(company_1) %>%
summarise(mean_revenue = mean(revenue)) %>%
filter(company_1 != "NA")
Creates an object with popularity summary of companies
company_popularity <- filtered_movies2 %>%
group_by(company_1) %>%
summarise(mean_popularity = mean(popularity)) %>%
filter(company_1 != "NA")
Shows the most common genres throughout
# Created an object with genres in descending in terms of volume
genres_desc <- filtered_movies2 %>%
count(genre_1) %>%
arrange(desc(n)) %>%
mutate(genres = factor(n)) %>%
filter(genre_1 != "NA")
Joined the objects created about company performance by; volume, revenue and popularity, Sampling top 15 companies descending by volume
joined_revpop <- left_join(companies_desc, company_revenue, by = "company_1")
joined_revpop <- left_join(joined_revpop, company_popularity, by = "company_1")
companies_top15 <- joined_revpop %>%
dplyr::slice_head(n = 15)
companies_top25 <- joined_revpop %>%
dplyr::slice_head(n = 25)
With the data wrangled and a new object created, we can now select the top 25 or top 15 production companies based on volume of released movies to analyse these and visualize any variables which determine company success.
With a linear model displayed on the plot, we can start to see which companies have high means of revenue a d popularity mapping their success by way of regression. Here we can understand that companies such as Walt Disney and Summit entertainment score exceptionally high in popularity across their released movies and receive a high mean revenue in return. While companies such as Metro-Goldwyn-Mayer and Touchstone seem to be struggling in comparison.
smooth_scatter <- ggplot(data = companies_top25,
mapping = aes(x = mean_revenue,
y = mean_popularity,
label = company_1)) +
geom_point(size = 4,
aes(colour = company_1)) +
geom_label_repel(label.size = 0.25, label.r = 0.05) +
geom_smooth() +
theme_minimal() +
labs(x = "Mean Average Revenue",
y = "Mean Movie Popularity",
title = "Company Success by Mean Revenue and Popularity",
colour = "Production Companies") +
scale_x_continuous("Mean Revenue",
breaks = scales::breaks_extended(10),
labels = scales::label_dollar()) +
theme(legend.position = "none")
smooth_scatter +
scale_colour_viridis_d()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Now we wanted to use a more computerized approach to exploring the same content. K-means clustering is a method of clustering n/movies into k clusters by way of assigning movies to the nearest cluster in mean values. It is an unsupervised machine learning technique which aims to partition observations accurately within a data set.
Creates a data frame with only the numeric variables within
reduced_data_numeric <- select(filtered_movies2, c(id, budget, popularity, revenue, runtime, vote_average, vote_count))
tmp_ids <- reduced_data_numeric %>%
pull(id)
tmp_ids_removed <- reduced_data_numeric %>% na.omit() %>% tibble() %>%
pull(id)
ids_to_filter <- setdiff(tmp_ids, tmp_ids_removed)
reduced_data_numeric <- reduced_data_numeric %>%
filter(!id %in% ids_to_filter)
filtered_movies2 <- filtered_movies2 %>%
filter(id %in% reduced_data_numeric$id)
Once created, it were possible to plot the k means cluster and observe total of n/movies in each cluster and their relative mean values such as; average popularity, run time etc.
K means with k equal to 5 (5 clusters)
KM <- kmeans(na.omit(reduced_data_numeric), 5)
cluster_plot <- autoplot(KM, na.omit(reduced_data_numeric), frame=TRUE)
cluster_plot +
scale_color_viridis_d() +
theme_minimal()
These mean values can be seen below by way of plotting a table using
package kable.
From this we can see what each cluster is in respects to this data. The least ‘successful’ cluster being the most populated and with the lowest average scores for all of the metrics other than size. Seemingly the most ‘successful’ of the clusters, has the longest mean run time, with high mean scores across all metrics except size.
More observations can be made by looking at this table. Noticeably, when focusing on ‘success’ of clusters (defined by mean popularity and revenue) it’s possible to see a correlation between cluster ‘success’ and run time. With the more successful clusters having a higher average run time, and this average value begins to descend as the clusters become less ‘successful’ across other metrics, scoring lower on all other metrics. Again, we can now start to tell a story about the likelihood of a movie’s ‘success’ based upon metrics displayed here.
tibble_KM <- tidy(KM)
tibble_KM <- relocate(tibble_KM, cluster,.before = budget, after = NULL)
kbl(tibble_KM) %>%
kable_styling(bootstrap_options = c("striped", "hover"))
| id | cluster | budget | popularity | revenue | runtime | vote_average | vote_count | size | withinss |
|---|---|---|---|---|---|---|---|---|---|
| 42782.95 | 1 | 90377950 | 53.97796 | 336236357 | 116.1056 | 6.479503 | 2272.9783 | 322 | 2.632697e+18 |
| 71517.77 | 2 | 208269231 | 158.30244 | 1228958349 | 144.2308 | 7.065385 | 6586.1923 | 26 | 3.847668e+18 |
| 44898.10 | 3 | 48849253 | 32.92726 | 132340955 | 112.4378 | 6.328587 | 1057.4676 | 941 | 2.888917e+18 |
| 61237.58 | 4 | 13243066 | 11.58566 | 14428973 | 103.5103 | 5.954191 | 261.2515 | 3257 | 2.340302e+18 |
| 58066.78 | 5 | 133229741 | 97.50760 | 695507260 | 121.5862 | 6.877586 | 4566.8966 | 116 | 2.221446e+18 |
Now we have created a k-means cluster for movies based on their commercial success.
While all of this information is rather interesting, it’s unclear as to what movies exactly fit inside of each of the five clusters. So with a little bit of wrangling using code in R we can assign each of the movies within the data set to their respective cluster.
Assigning cluster numbers to all the movies in the original df
movies_by_cluster <- KM$cluster %>%
tidy() %>%
rename(cluster = x)
filtered_movies2 <- filtered_movies2 %>% mutate(cluster = movies_by_cluster$cluster)
Once the original data frame now has this information mutated onto it, it is possible to start plotting the frequency of movie count in each cluster by their genre category. Doing so we can determine which genres are most likely to be included within each cluster, further developing an understanding of the commercial success of movies.
Genre data organised into ascending order
theTable <- within(filtered_movies2,
genre_1 <- factor(genre_1,
levels = names(sort(table(genre_1), decreasing = TRUE))))
jitter_plot <- ggplot(theTable, aes(y = cluster, x = genre_1, fill = genre_1)) +
geom_point(shape = 21, colour = "black", alpha = 0.5, position = "jitter") +
scale_fill_viridis_d() +
labs(title = "Genre frequency per cluster", y = "Cluster", x = "Genre") +
theme_minimal()+
theme(legend.position = "none")
jitter_plot