library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.4     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggridges)   # unused in final blog post
library(scales,warn.conflicts = FALSE)
library(tidytext)   # unused in final blog post


sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] tidytext_0.3.1  scales_1.1.1    ggridges_0.5.3  forcats_0.5.0  
##  [5] stringr_1.4.0   dplyr_1.0.2     purrr_0.3.4     readr_1.4.0    
##  [9] tidyr_1.1.2     tibble_3.0.4    ggplot2_3.3.3   tidyverse_1.3.0
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.1.0  xfun_0.19         lattice_0.20-41   haven_2.3.1      
##  [5] colorspace_2.0-0  vctrs_0.3.5       generics_0.1.0    SnowballC_0.7.0  
##  [9] htmltools_0.5.0   yaml_2.2.1        rlang_0.4.10      pillar_1.4.7     
## [13] glue_1.4.2        withr_2.3.0       DBI_1.1.0         dbplyr_2.0.0     
## [17] modelr_0.1.8      readxl_1.3.1      lifecycle_0.2.0   plyr_1.8.6       
## [21] munsell_0.5.0     gtable_0.3.0      cellranger_1.1.0  rvest_0.3.6      
## [25] evaluate_0.14     knitr_1.30        fansi_0.4.1       tokenizers_0.2.1 
## [29] broom_0.7.2       Rcpp_1.0.6        backports_1.2.1   jsonlite_1.7.2   
## [33] fs_1.5.0          hms_0.5.3         digest_0.6.27     stringi_1.5.3    
## [37] grid_4.0.3        cli_2.2.0         tools_4.0.3       magrittr_2.0.1   
## [41] janeaustenr_0.1.5 crayon_1.3.4      pkgconfig_2.0.3   Matrix_1.2-18    
## [45] ellipsis_0.3.1    xml2_1.3.2        reprex_0.3.0      lubridate_1.7.9.2
## [49] assertthat_0.2.1  rmarkdown_2.6     httr_1.4.2        rstudioapi_0.13  
## [53] R6_2.5.0          compiler_4.0.3

Helper function to read IMDB files given filename.

read_imdb <- function(data_path) {
  path <- "/Users/ankurhusky71/Desktop/IMDB_datasets/"
  read_tsv(paste0(path, data_path), na = "\\N", quote='', progress=F)
}

Helper function to pretty print the size of a dataframe for charts/notebook.

ppdf <- function(df) {
  df %>% nrow() %>% comma()
}

Ratings

df_ratings <- read_imdb("title.ratings.tsv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   tconst = col_character(),
##   averageRating = col_double(),
##   numVotes = col_double()
## )
df_ratings %>% head()

Plot a 2D histogram and clean up axes.

plot2D_histogram <- ggplot(df_ratings, aes(x = numVotes, y = averageRating)) +
          geom_bin2d() +
          scale_x_log10(labels = comma) +
          scale_y_continuous(breaks = 1:10) +
          scale_fill_viridis_c(labels = comma)
plot2D_histogram

Title Basics

df_basics <- read_imdb("title.basics.tsv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   tconst = col_character(),
##   titleType = col_character(),
##   primaryTitle = col_character(),
##   originalTitle = col_character(),
##   isAdult = col_double(),
##   startYear = col_double(),
##   endYear = col_logical(),
##   runtimeMinutes = col_double(),
##   genres = col_character()
## )
## Warning: 75427 parsing failures.
##   row     col           expected actual                                                         file
## 34983 endYear 1/0/T/F/TRUE/FALSE   1947 '/Users/ankurhusky71/Desktop/IMDB_datasets/title.basics.tsv'
## 35184 endYear 1/0/T/F/TRUE/FALSE   1945 '/Users/ankurhusky71/Desktop/IMDB_datasets/title.basics.tsv'
## 37613 endYear 1/0/T/F/TRUE/FALSE   1955 '/Users/ankurhusky71/Desktop/IMDB_datasets/title.basics.tsv'
## 38447 endYear 1/0/T/F/TRUE/FALSE   1949 '/Users/ankurhusky71/Desktop/IMDB_datasets/title.basics.tsv'
## 38448 endYear 1/0/T/F/TRUE/FALSE   1949 '/Users/ankurhusky71/Desktop/IMDB_datasets/title.basics.tsv'
## ..... ....... .................. ...... ............................................................
## See problems(...) for more details.
df_basics %>% head()

There are 7,824,174 titles in the dataset.

Merge df_ratings and df_basics to perform ratings/vote analysis using more metadata.

df Ratings

df ratings left join with df_basics

df_ratings <- df_ratings %>% left_join(df_basics)
## Joining, by = "tconst"
df_ratings %>% head()

Which movies have the superhigh runtimes?

df_ratings %>% arrange(desc(runtimeMinutes)) %>% head(10)
plot_movie_runtime <- ggplot(df_ratings %>% filter(runtimeMinutes < 180, titleType=="movie", numVotes >= 10), aes(x = runtimeMinutes, y = averageRating)) +
          geom_bin2d() +
          scale_x_continuous(breaks = seq(0, 180, 60), labels = 0:3) +
          scale_y_continuous(breaks = 0:10) +
          scale_fill_viridis_c(option = "inferno", labels = comma) +
          # theme_minimal(base_family = "Source Sans Pro", base_size=8) +
          labs(title="Relationship between Movie Runtime and Average Movie Rating",
               subtitle="Data from IMDb retrieved July 4th, 2018",
               x="Runtime (Hours)",
               y="Average User Rating",
               caption="Max Woolf — minimaxir.com",
               fill="# Movies")

Movie Runtime Versus Runtime Hours

The color of the cell represents the Number of Movies

plot_movie_runtime

Rating vs. Movie Year

Set theme to custom theme based on theme_minimal for the rest of the notebook.

theme_set(theme_minimal(base_size=9) +
            theme(plot.title = element_text(size=8, margin=margin(t = -0.1, b = 0.1, unit='cm')),
                  axis.title.x = element_text(size=8),
                  axis.title.y = element_text(size=8),
                  plot.subtitle = element_text(color="#969696", size=6),
                  plot.caption = element_text(size=6, color="#969696"),
                  legend.title = element_text(size=8),
                  legend.key.width = unit(0.25, unit='cm')))
plot_ratings_vs_movieYear <- ggplot(df_ratings %>% filter(titleType=="movie", numVotes >= 10), aes(x = startYear, y = averageRating)) +
          geom_bin2d() +
          geom_smooth(color="black") +
          scale_x_continuous() +
          scale_y_continuous(breaks = 1:10) +
          scale_fill_viridis_c(option = "plasma", labels = comma, trans='log10') +
          labs(title="Relationship between Movie Release Year and Average Rating",
               subtitle=sprintf("For %s Movies/Ratings. Data from IMDb retrieved 7/4/2018", df_ratings %>% filter(titleType=="movie", numVotes >= 10) %>% ppdf),
               x="Year Movie was Released",
               y="Average User Rating For Movie",
               caption="Max Woolf — minimaxir.com",
               fill="# Movies")

Ratings Vs Year Movie was released

1.The x axis represents the year in which Movie was released 2.The Y represents the Average Rating 3.The Color Represents the Number of Such Movies

plot_ratings_vs_movieYear
## Warning: Removed 22 rows containing non-finite values (stat_bin2d).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 22 rows containing non-finite values (stat_smooth).

plot_ridge <- ggplot(df_ratings %>% filter(startYear >= 2000, titleType=="movie",  numVotes >= 10) %>% mutate(startYear = factor(startYear)), aes(x = averageRating, y = startYear, fill=startYear)) +
          geom_density_ridges() +
          scale_fill_hue(guide=F) +
          scale_x_continuous(breaks = 1:10) +
          theme_minimal( base_size=9)

Ridge Plot

Average Ratings Vs Start Year

plot_ridge
## Picking joint bandwidth of 0.261

Bucket by decades.

Episode Analysis// df_episode

df_episode <- read_imdb("title.episode.tsv") %>% filter(!is.na(seasonNumber))
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   tconst = col_character(),
##   parentTconst = col_character(),
##   seasonNumber = col_double(),
##   episodeNumber = col_double()
## )
df_episode %>% head()
#df_episode_count <- df_episode %>%
#                group_by(parentTconst, seasonNumber) %>%
#                tally() %>%
#                left_join(df_basics,  c("parentTconst" = "tconst"))

#df_episode_count %>% head()

Actor Information // df_actors

str_detect is vectorized and much faster than using a loop/lapply. Using a regular expression to search for actor or actress is another speed increase.

df_actors <- read_imdb("name.basics.tsv") %>%
                filter(str_detect(primaryProfession, "actor|actress"))  %>%
                select(nconst, primaryName, birthYear)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   nconst = col_character(),
##   primaryName = col_character(),
##   birthYear = col_double(),
##   deathYear = col_double(),
##   primaryProfession = col_character(),
##   knownForTitles = col_character()
## )
df_actors %>% head()

There are 4,010,259 actors in the dataset. # df_principals

df_principals <- read_imdb("title.principals.tsv") %>%
  filter(str_detect(category, "actor|actress")) %>%
  select(tconst, ordering, nconst, category) %>%
  group_by(tconst) %>%
  filter(ordering == min(ordering))
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   tconst = col_character(),
##   ordering = col_double(),
##   nconst = col_character(),
##   category = col_character(),
##   job = col_character(),
##   characters = col_character()
## )
df_principals %>% head()

There are 4,389,097 principals/rows in the dataset.

Join the 2 dataframes. (onto principals, since Many-to-One)

df_principals <- df_principals %>% left_join(df_actors)
## Joining, by = "nconst"
df_principals %>% head()

Putting It All Together

Merge actor information onto the full ratings dataframe.

df_ratings <- df_ratings %>% left_join(df_principals)
## Joining, by = "tconst"
df_ratings %>% head()

Filter down to movies w/ actor info. (only if the birth year is present in the data)

df_ratings_movies <- df_ratings %>%
                        filter(titleType == "movie", !is.na(birthYear), numVotes >= 10) %>%
                        mutate(age_lead = startYear - birthYear) %>%
                        arrange(desc(numVotes))

df_ratings_movies %>% head(100)

Aggregate lead-actor/actress ages by movie year w/ percentiles.

df_actor_ages <- df_ratings_movies %>%
                  group_by(startYear) %>%
                  summarize(low_age = quantile(age_lead, 0.25, na.rm=T),
                            med_age = quantile(age_lead, 0.50, na.rm=T),
                            high_age = quantile(age_lead, 0.75, na.rm=T)) %>%
                  arrange(startYear)
## `summarise()` ungrouping output (override with `.groups` argument)
df_actor_ages %>% head()

Create a ribbon plot.

NB: Plot the ribbon before the line, so the line is on top.

plot_ages <- ggplot(df_actor_ages %>% filter(startYear >= 1920) , aes(x = startYear)) +
          geom_ribbon(aes(ymin=low_age, ymax=high_age), alpha=0.2) +
          geom_line(aes(y=med_age)) +
          labs(title="Change in Ages of Movie Lead Actors/Actress Over Time",
               subtitle=sprintf("For %s Actors. Line represents median age.\nRibbon bounds represent 25th — 75th Percentiles. Data from IMDb retrieved 7/4/2018",df_ratings_movies %>% filter(startYear >= 1920) %>% ppdf()),
               x="Year Movie was Released",
               y="Age of Lead Actor/Actress",
               caption="Max Woolf — minimaxir.com",
               fill="# Movies")

Age of the Lead Actor / Actress Vs Year the Movie Was Released

plot_ages

Create a plot comparing actors/actresses.

df_actor_ages_lead <- df_ratings_movies %>%
                  group_by(startYear, category) %>%
                  summarize(low_age = quantile(age_lead, 0.25, na.rm=T),
                            med_age = quantile(age_lead, 0.50, na.rm=T),
                            high_age = quantile(age_lead, 0.75, na.rm=T)) %>%
                  arrange(startYear)
## `summarise()` regrouping output by 'startYear' (override with `.groups` argument)
df_actor_ages_lead %>% head()
plot_lead_ages <- ggplot(df_actor_ages_lead %>% filter(startYear >= 1920), aes(x = startYear, fill=category, color=category)) +
          geom_ribbon(aes(ymin=low_age, ymax=high_age), alpha=0.2, size=0) +
          geom_line(aes(y=med_age)) +
          scale_fill_brewer(palette="Set1") +
          scale_color_brewer(palette="Set1") +
          labs(title="Change in Ages of Movie Lead Actors/Actress Over Time",
               subtitle=sprintf("For %s Actors. Line represents median age.\nRibbon bounds represent 25th — 75th Percentiles. Data from IMDb retrieved 7/4/2018",df_ratings_movies %>% filter(startYear >= 1920) %>% ppdf()),
               x="Year Movie was Released",
               y="Age of Lead Actor/Actress",
               caption="Max Woolf — minimaxir.com",
               fill='',
               color='')

Color Chart// Age of the Lead Actor / Actress Vs Year the Movie Was Released

plot_lead_ages

Lead Gender Balance

Unused in post since a bit more complicated to explain and results need double-checking.

plot_gender_balance <- ggplot(df_ratings_movies %>% filter(startYear >= 1950), aes(x = startYear, fill=category)) +
          geom_bar(position="fill", width=1) +
          theme_minimal( base_size=9) +
          scale_fill_brewer(palette="Set1") +
          scale_color_brewer(palette="Set1")

Gender Balance Vs The Year In Which the Movie Was Released

plot_gender_balance

nth time lead

df_ratings_movies_nth <- df_ratings_movies %>%
                      group_by(nconst) %>%
                      arrange(startYear) %>%
                      mutate(nth_lead = row_number()) %>%
                      ungroup() %>%
                      arrange(desc(startYear), desc(numVotes))

df_ratings_movies_nth %>% select(primaryTitle, primaryName, nth_lead) %>% head(100)
df_actor_ages <- df_ratings_movies_nth %>%
                  group_by(startYear) %>%
                  summarize(low_nth = quantile(nth_lead, 0.25),
                            med_nth = quantile(nth_lead, 0.50),
                            high_nth = quantile(nth_lead, 0.75)) %>%
                  arrange(startYear)
## `summarise()` ungrouping output (override with `.groups` argument)
df_actor_ages %>% head()
plot_final <- ggplot(df_actor_ages %>% filter(startYear >= 1950) , aes(x = startYear)) +
          geom_ribbon(aes(ymin=low_nth, ymax=high_nth), alpha=0.2) +
          geom_line(aes(y=med_nth)) +
          scale_y_continuous(breaks=c(1:5, 10)) +
          labs(title="#th Time Lead Actor of Movie Was A Lead Actor, Over Time",
               subtitle=sprintf("For %s Lead Actors. Line represents median #.\nRibbon bounds represent 25th — 75th Percentiles. Data from IMDb retrieved 7/4/2018",df_ratings_movies_nth %>% filter(startYear >= 1950) %>% ppdf()),
               x="Year",
               y="#th Time Lead Actor was a Lead Actor",
               caption="Max Woolf — minimaxir.com",
               fill="# Movies") +
          theme(panel.grid.minor = element_blank())
plot_final

!