library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.0.4 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggridges) # unused in final blog post
library(scales,warn.conflicts = FALSE)
library(tidytext) # unused in final blog post
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] tidytext_0.3.1 scales_1.1.1 ggridges_0.5.3 forcats_0.5.0
## [5] stringr_1.4.0 dplyr_1.0.2 purrr_0.3.4 readr_1.4.0
## [9] tidyr_1.1.2 tibble_3.0.4 ggplot2_3.3.3 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.1.0 xfun_0.19 lattice_0.20-41 haven_2.3.1
## [5] colorspace_2.0-0 vctrs_0.3.5 generics_0.1.0 SnowballC_0.7.0
## [9] htmltools_0.5.0 yaml_2.2.1 rlang_0.4.10 pillar_1.4.7
## [13] glue_1.4.2 withr_2.3.0 DBI_1.1.0 dbplyr_2.0.0
## [17] modelr_0.1.8 readxl_1.3.1 lifecycle_0.2.0 plyr_1.8.6
## [21] munsell_0.5.0 gtable_0.3.0 cellranger_1.1.0 rvest_0.3.6
## [25] evaluate_0.14 knitr_1.30 fansi_0.4.1 tokenizers_0.2.1
## [29] broom_0.7.2 Rcpp_1.0.6 backports_1.2.1 jsonlite_1.7.2
## [33] fs_1.5.0 hms_0.5.3 digest_0.6.27 stringi_1.5.3
## [37] grid_4.0.3 cli_2.2.0 tools_4.0.3 magrittr_2.0.1
## [41] janeaustenr_0.1.5 crayon_1.3.4 pkgconfig_2.0.3 Matrix_1.2-18
## [45] ellipsis_0.3.1 xml2_1.3.2 reprex_0.3.0 lubridate_1.7.9.2
## [49] assertthat_0.2.1 rmarkdown_2.6 httr_1.4.2 rstudioapi_0.13
## [53] R6_2.5.0 compiler_4.0.3
Helper function to read IMDB files given filename.
read_imdb <- function(data_path) {
path <- "/Users/ankurhusky71/Desktop/IMDB_datasets/"
read_tsv(paste0(path, data_path), na = "\\N", quote='', progress=F)
}
Helper function to pretty print the size of a dataframe for charts/notebook.
ppdf <- function(df) {
df %>% nrow() %>% comma()
}
df_ratings <- read_imdb("title.ratings.tsv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## tconst = col_character(),
## averageRating = col_double(),
## numVotes = col_double()
## )
df_ratings %>% head()
Plot a 2D histogram and clean up axes.
plot2D_histogram <- ggplot(df_ratings, aes(x = numVotes, y = averageRating)) +
geom_bin2d() +
scale_x_log10(labels = comma) +
scale_y_continuous(breaks = 1:10) +
scale_fill_viridis_c(labels = comma)
plot2D_histogram
df_basics <- read_imdb("title.basics.tsv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## tconst = col_character(),
## titleType = col_character(),
## primaryTitle = col_character(),
## originalTitle = col_character(),
## isAdult = col_double(),
## startYear = col_double(),
## endYear = col_logical(),
## runtimeMinutes = col_double(),
## genres = col_character()
## )
## Warning: 75427 parsing failures.
## row col expected actual file
## 34983 endYear 1/0/T/F/TRUE/FALSE 1947 '/Users/ankurhusky71/Desktop/IMDB_datasets/title.basics.tsv'
## 35184 endYear 1/0/T/F/TRUE/FALSE 1945 '/Users/ankurhusky71/Desktop/IMDB_datasets/title.basics.tsv'
## 37613 endYear 1/0/T/F/TRUE/FALSE 1955 '/Users/ankurhusky71/Desktop/IMDB_datasets/title.basics.tsv'
## 38447 endYear 1/0/T/F/TRUE/FALSE 1949 '/Users/ankurhusky71/Desktop/IMDB_datasets/title.basics.tsv'
## 38448 endYear 1/0/T/F/TRUE/FALSE 1949 '/Users/ankurhusky71/Desktop/IMDB_datasets/title.basics.tsv'
## ..... ....... .................. ...... ............................................................
## See problems(...) for more details.
df_basics %>% head()
There are 7,824,174 titles in the dataset.
Merge df_ratings and df_basics to perform ratings/vote analysis using more metadata.
df ratings left join with df_basics
df_ratings <- df_ratings %>% left_join(df_basics)
## Joining, by = "tconst"
df_ratings %>% head()
Which movies have the superhigh runtimes?
df_ratings %>% arrange(desc(runtimeMinutes)) %>% head(10)
plot_movie_runtime <- ggplot(df_ratings %>% filter(runtimeMinutes < 180, titleType=="movie", numVotes >= 10), aes(x = runtimeMinutes, y = averageRating)) +
geom_bin2d() +
scale_x_continuous(breaks = seq(0, 180, 60), labels = 0:3) +
scale_y_continuous(breaks = 0:10) +
scale_fill_viridis_c(option = "inferno", labels = comma) +
# theme_minimal(base_family = "Source Sans Pro", base_size=8) +
labs(title="Relationship between Movie Runtime and Average Movie Rating",
subtitle="Data from IMDb retrieved July 4th, 2018",
x="Runtime (Hours)",
y="Average User Rating",
caption="Max Woolf — minimaxir.com",
fill="# Movies")
The color of the cell represents the Number of Movies
plot_movie_runtime
Set theme to custom theme based on theme_minimal for the rest of the notebook.
theme_set(theme_minimal(base_size=9) +
theme(plot.title = element_text(size=8, margin=margin(t = -0.1, b = 0.1, unit='cm')),
axis.title.x = element_text(size=8),
axis.title.y = element_text(size=8),
plot.subtitle = element_text(color="#969696", size=6),
plot.caption = element_text(size=6, color="#969696"),
legend.title = element_text(size=8),
legend.key.width = unit(0.25, unit='cm')))
plot_ratings_vs_movieYear <- ggplot(df_ratings %>% filter(titleType=="movie", numVotes >= 10), aes(x = startYear, y = averageRating)) +
geom_bin2d() +
geom_smooth(color="black") +
scale_x_continuous() +
scale_y_continuous(breaks = 1:10) +
scale_fill_viridis_c(option = "plasma", labels = comma, trans='log10') +
labs(title="Relationship between Movie Release Year and Average Rating",
subtitle=sprintf("For %s Movies/Ratings. Data from IMDb retrieved 7/4/2018", df_ratings %>% filter(titleType=="movie", numVotes >= 10) %>% ppdf),
x="Year Movie was Released",
y="Average User Rating For Movie",
caption="Max Woolf — minimaxir.com",
fill="# Movies")
1.The x axis represents the year in which Movie was released 2.The Y represents the Average Rating 3.The Color Represents the Number of Such Movies
plot_ratings_vs_movieYear
## Warning: Removed 22 rows containing non-finite values (stat_bin2d).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 22 rows containing non-finite values (stat_smooth).
plot_ridge <- ggplot(df_ratings %>% filter(startYear >= 2000, titleType=="movie", numVotes >= 10) %>% mutate(startYear = factor(startYear)), aes(x = averageRating, y = startYear, fill=startYear)) +
geom_density_ridges() +
scale_fill_hue(guide=F) +
scale_x_continuous(breaks = 1:10) +
theme_minimal( base_size=9)
Average Ratings Vs Start Year
plot_ridge
## Picking joint bandwidth of 0.261
Bucket by decades.
df_episode <- read_imdb("title.episode.tsv") %>% filter(!is.na(seasonNumber))
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## tconst = col_character(),
## parentTconst = col_character(),
## seasonNumber = col_double(),
## episodeNumber = col_double()
## )
df_episode %>% head()
#df_episode_count <- df_episode %>%
# group_by(parentTconst, seasonNumber) %>%
# tally() %>%
# left_join(df_basics, c("parentTconst" = "tconst"))
#df_episode_count %>% head()
str_detect is vectorized and much faster than using a loop/lapply. Using a regular expression to search for actor or actress is another speed increase.
df_actors <- read_imdb("name.basics.tsv") %>%
filter(str_detect(primaryProfession, "actor|actress")) %>%
select(nconst, primaryName, birthYear)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## nconst = col_character(),
## primaryName = col_character(),
## birthYear = col_double(),
## deathYear = col_double(),
## primaryProfession = col_character(),
## knownForTitles = col_character()
## )
df_actors %>% head()
There are 4,010,259 actors in the dataset. # df_principals
df_principals <- read_imdb("title.principals.tsv") %>%
filter(str_detect(category, "actor|actress")) %>%
select(tconst, ordering, nconst, category) %>%
group_by(tconst) %>%
filter(ordering == min(ordering))
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## tconst = col_character(),
## ordering = col_double(),
## nconst = col_character(),
## category = col_character(),
## job = col_character(),
## characters = col_character()
## )
df_principals %>% head()
There are 4,389,097 principals/rows in the dataset.
Join the 2 dataframes. (onto principals, since Many-to-One)
df_principals <- df_principals %>% left_join(df_actors)
## Joining, by = "nconst"
df_principals %>% head()
Merge actor information onto the full ratings dataframe.
df_ratings <- df_ratings %>% left_join(df_principals)
## Joining, by = "tconst"
df_ratings %>% head()
Filter down to movies w/ actor info. (only if the birth year is present in the data)
df_ratings_movies <- df_ratings %>%
filter(titleType == "movie", !is.na(birthYear), numVotes >= 10) %>%
mutate(age_lead = startYear - birthYear) %>%
arrange(desc(numVotes))
df_ratings_movies %>% head(100)
Aggregate lead-actor/actress ages by movie year w/ percentiles.
df_actor_ages <- df_ratings_movies %>%
group_by(startYear) %>%
summarize(low_age = quantile(age_lead, 0.25, na.rm=T),
med_age = quantile(age_lead, 0.50, na.rm=T),
high_age = quantile(age_lead, 0.75, na.rm=T)) %>%
arrange(startYear)
## `summarise()` ungrouping output (override with `.groups` argument)
df_actor_ages %>% head()
Create a ribbon plot.
NB: Plot the ribbon before the line, so the line is on top.
plot_ages <- ggplot(df_actor_ages %>% filter(startYear >= 1920) , aes(x = startYear)) +
geom_ribbon(aes(ymin=low_age, ymax=high_age), alpha=0.2) +
geom_line(aes(y=med_age)) +
labs(title="Change in Ages of Movie Lead Actors/Actress Over Time",
subtitle=sprintf("For %s Actors. Line represents median age.\nRibbon bounds represent 25th — 75th Percentiles. Data from IMDb retrieved 7/4/2018",df_ratings_movies %>% filter(startYear >= 1920) %>% ppdf()),
x="Year Movie was Released",
y="Age of Lead Actor/Actress",
caption="Max Woolf — minimaxir.com",
fill="# Movies")
plot_ages
Create a plot comparing actors/actresses.
df_actor_ages_lead <- df_ratings_movies %>%
group_by(startYear, category) %>%
summarize(low_age = quantile(age_lead, 0.25, na.rm=T),
med_age = quantile(age_lead, 0.50, na.rm=T),
high_age = quantile(age_lead, 0.75, na.rm=T)) %>%
arrange(startYear)
## `summarise()` regrouping output by 'startYear' (override with `.groups` argument)
df_actor_ages_lead %>% head()
plot_lead_ages <- ggplot(df_actor_ages_lead %>% filter(startYear >= 1920), aes(x = startYear, fill=category, color=category)) +
geom_ribbon(aes(ymin=low_age, ymax=high_age), alpha=0.2, size=0) +
geom_line(aes(y=med_age)) +
scale_fill_brewer(palette="Set1") +
scale_color_brewer(palette="Set1") +
labs(title="Change in Ages of Movie Lead Actors/Actress Over Time",
subtitle=sprintf("For %s Actors. Line represents median age.\nRibbon bounds represent 25th — 75th Percentiles. Data from IMDb retrieved 7/4/2018",df_ratings_movies %>% filter(startYear >= 1920) %>% ppdf()),
x="Year Movie was Released",
y="Age of Lead Actor/Actress",
caption="Max Woolf — minimaxir.com",
fill='',
color='')
plot_lead_ages
Unused in post since a bit more complicated to explain and results need double-checking.
plot_gender_balance <- ggplot(df_ratings_movies %>% filter(startYear >= 1950), aes(x = startYear, fill=category)) +
geom_bar(position="fill", width=1) +
theme_minimal( base_size=9) +
scale_fill_brewer(palette="Set1") +
scale_color_brewer(palette="Set1")
plot_gender_balance
df_ratings_movies_nth <- df_ratings_movies %>%
group_by(nconst) %>%
arrange(startYear) %>%
mutate(nth_lead = row_number()) %>%
ungroup() %>%
arrange(desc(startYear), desc(numVotes))
df_ratings_movies_nth %>% select(primaryTitle, primaryName, nth_lead) %>% head(100)
df_actor_ages <- df_ratings_movies_nth %>%
group_by(startYear) %>%
summarize(low_nth = quantile(nth_lead, 0.25),
med_nth = quantile(nth_lead, 0.50),
high_nth = quantile(nth_lead, 0.75)) %>%
arrange(startYear)
## `summarise()` ungrouping output (override with `.groups` argument)
df_actor_ages %>% head()
plot_final <- ggplot(df_actor_ages %>% filter(startYear >= 1950) , aes(x = startYear)) +
geom_ribbon(aes(ymin=low_nth, ymax=high_nth), alpha=0.2) +
geom_line(aes(y=med_nth)) +
scale_y_continuous(breaks=c(1:5, 10)) +
labs(title="#th Time Lead Actor of Movie Was A Lead Actor, Over Time",
subtitle=sprintf("For %s Lead Actors. Line represents median #.\nRibbon bounds represent 25th — 75th Percentiles. Data from IMDb retrieved 7/4/2018",df_ratings_movies_nth %>% filter(startYear >= 1950) %>% ppdf()),
x="Year",
y="#th Time Lead Actor was a Lead Actor",
caption="Max Woolf — minimaxir.com",
fill="# Movies") +
theme(panel.grid.minor = element_blank())
plot_final
!