Exploring your favorite movies

Thomas E. Love

2021-09-21

Note: This document is published to https://rpubs.com/TELOVE/movies-eda-1

Setup

## Normally, I would not include this chunk of code
## in a final report

library(knitr)
library(rmdformats)

## Global options
## max.print is an integer option, so it is given as a number here. A
## character value ("75") only works where the reader coerces it, and
## printing code that validates the option numerically can reject it.
options(max.print = 75)
## knitr chunk option `comment` and knitr package option `width`.
opts_chunk$set(comment = NA)
opts_knit$set(width = 75)
library(googlesheets4) # loading in data from Google Sheet
library(glue)  # combining text and R results
library(ggrepel) # for geom_text_repel and geom_label_repel
library(janitor)
library(knitr)
library(magrittr)
library(naniar)
library(tidyverse)

## Every ggplot below uses the black-and-white theme.
theme_set(theme_bw())

## Read the class Google Sheet and keep only the seven columns that the
## rest of this document works with.
mov21 <-
  "https://docs.google.com/spreadsheets/d/1GHdGOZ7k20S--v6xewnyLx5vNJLDTtsok-R4AU1hvYY/edit#gid=1563354494" %>%
  read_sheet() %>%
  select(
    film_id, film, year, length,
    imdb_categories, imdb_ratings, imdb_stars
  )

## Print the data, then tabulate missing values for each variable.
mov21
miss_var_summary(mov21)

Are more recent movies more highly rated?

## Pearson correlation of release year and IMDB star rating, rounded to
## three decimal places for the plot subtitle.
temp_cor <- mov21 %$%
  cor(year, imdb_stars) %>%
  round_half_up(., digits = 3)

## Scatterplot of star rating against release year with a loess smooth
## (blue) and a straight-line fit (red). Film titles are shown for the
## rows picked by slice_max / slice_min on imdb_stars with n = 2.
ggplot(mov21, aes(x = year, y = imdb_stars, label = film)) +
  geom_point() +
  geom_smooth(method = "loess", col = "blue",
              formula = y ~ x, se = FALSE) +
  geom_smooth(method = "lm", col = "red",
              formula = y ~ x, se = FALSE) +
  geom_text_repel(data = mov21 %>% slice_max(imdb_stars, n = 2)) +
  geom_text_repel(data = mov21 %>% slice_min(imdb_stars, n = 2)) +
  labs(x = "Year of Release",
       y = "Weighted Average IMDB Star Rating",
       title = "Are newer films more highly rated?",
       subtitle = glue("Pearson correlation of `imdb_stars` and `year` is {temp_cor}."),
       ## The film count is taken from the data so the caption cannot
       ## drift from the sheet; the student count is not a column of
       ## mov21 and so stays as typed.
       caption = glue("{nrow(mov21)} films mentioned as favorites by 431 students in 2020 and in 2021"))

What if we restrict to films since 1980?

## Subset of films whose release year is after 1979.
mov21_recent <- filter(mov21, year > 1979)

## Correlation of year and star rating within that subset, rounded to
## three decimal places for the subtitle.
temp_cor2 <- with(mov21_recent, cor(year, imdb_stars)) %>%
  round_half_up(digits = 3)

## Same scatterplot as for the full data: loess smooth in blue, linear
## fit in red, with labels on the rows returned by slice_max and
## slice_min on imdb_stars.
mov21_recent %>%
  ggplot(aes(x = year, y = imdb_stars, label = film)) +
  geom_point() +
  geom_smooth(method = "loess", formula = y ~ x,
              se = FALSE, col = "blue") +
  geom_smooth(method = "lm", formula = y ~ x,
              se = FALSE, col = "red") +
  geom_label_repel(data = slice_max(mov21_recent, imdb_stars, n = 1)) +
  geom_label_repel(data = slice_min(mov21_recent, imdb_stars, n = 1)) +
  labs(
    title = "Among films since 1980, are newer films more highly rated?",
    subtitle = glue("Correlation is {temp_cor2} in films since 1980."),
    caption = glue("{nrow(mov21_recent)} favorite films released since 1980"),
    x = "Year of Release",
    y = "Weighted Average IMDB Star Rating"
  )

Are more recent movies longer?

## Pearson correlation of release year and running time, rounded to
## three decimal places for the plot subtitle.
temp_cor3 <- mov21 %$%
  cor(year, length) %>%
  round_half_up(., digits = 3)

## Scatterplot of running time against release year with a loess smooth
## (blue) and a straight-line fit (red). The row(s) with the smallest
## year are labeled with the film title.
ggplot(mov21, aes(x = year, y = length, label = film)) +
  geom_point() +
  geom_smooth(method = "loess", col = "blue",
              formula = y ~ x, se = FALSE) +
  geom_smooth(method = "lm", col = "red",
              formula = y ~ x, se = FALSE) +
  geom_label_repel(data = mov21 %>% slice_min(year)) +
  labs(x = "Year of Release",
       y = "Length of Movie (in minutes)",
       title = "Are more recent movies longer?",
       subtitle = glue("Pearson correlation of `year` and `length` is {temp_cor3}."),
       ## The film count is taken from the data so the caption cannot
       ## drift from the sheet; the student count is not a column of
       ## mov21 and so stays as typed.
       caption = glue("{nrow(mov21)} films mentioned as favorites by 431 students in 2020 and in 2021"))

Do longer films have higher ratings?

## Pearson correlation of running time and IMDB star rating, rounded to
## three decimal places for the plot subtitle.
temp_cor4 <- mov21 %$%
  cor(length, imdb_stars) %>%
  round_half_up(., digits = 3)

## Scatterplot of star rating against running time with a loess smooth
## (blue) and a straight-line fit (red). Labels go on the rows returned
## by slice_max on length with n = 2.
ggplot(mov21,
       aes(x = length, y = imdb_stars, label = film)) +
  geom_point() +
  geom_smooth(method = "loess", col = "blue",
              formula = y ~ x, se = FALSE) +
  geom_smooth(method = "lm", col = "red",
              formula = y ~ x, se = FALSE) +
  geom_label_repel(
    data = mov21 %>% slice_max(length, n = 2)) +
  labs(x = "Length of Movie (in minutes)",
       y = "Weighted Average IMDB Star Rating",
       title = "Do longer films have higher ratings?",
       subtitle = glue("Pearson correlation of `imdb_stars` and `length` is {temp_cor4}."),
       ## The film count is taken from the data so the caption cannot
       ## drift from the sheet; the student count is not a column of
       ## mov21 and so stays as typed.
       caption = glue("{nrow(mov21)} films mentioned as favorites by 431 students in 2020 and in 2021"))

## Correlation of running time and star rating, rounded to three decimal
## places. It is recomputed here so this chunk can be run on its own.
temp_cor4 <- mov21 %$%
  cor(length, imdb_stars) %>%
  round_half_up(., digits = 3)

## Scatterplot of star rating against running time with a loess smooth
## (blue) and a straight-line fit (red). This version labels the films
## whose imdb_stars value is below 6.
ggplot(mov21, aes(x = length, y = imdb_stars, label = film)) +
  geom_point() +
  geom_smooth(method = "loess", col = "blue",
              formula = y ~ x, se = FALSE) +
  geom_smooth(method = "lm", col = "red",
              formula = y ~ x, se = FALSE) +
  geom_text_repel(data = subset(mov21, imdb_stars < 6)) +
  labs(x = "Length of Movie (in minutes)",
       y = "Weighted Average IMDB Star Rating",
       title = "Do longer films have higher ratings?",
       subtitle = glue("Pearson correlation of `imdb_stars` and `length` is {temp_cor4}."),
       ## The film count is taken from the data so the caption cannot
       ## drift from the sheet; the student count is not a column of
       ## mov21 and so stays as typed.
       caption = glue("{nrow(mov21)} films mentioned as favorites by 431 students in 2020 and in 2021"))

Are the “stars” and number of ratings strongly associated?

## Pearson correlation of the number of IMDB ratings and the star
## rating, rounded to three decimal places for the plot subtitle.
temp_cor5 <- mov21 %$%
  cor(imdb_ratings, imdb_stars) %>%
  round_half_up(., digits = 3)

## Scatterplot of star rating against number of ratings with a loess
## smooth (blue) and a straight-line fit (red). Text labels mark films
## with imdb_stars below 6; boxed labels mark the rows returned by
## slice_max on imdb_ratings with n = 5.
ggplot(mov21, aes(x = imdb_ratings, y = imdb_stars, label = film)) +
  geom_point() +
  geom_smooth(method = "loess", col = "blue",
              formula = y ~ x, se = FALSE) +
  geom_smooth(method = "lm", col = "red",
              formula = y ~ x, se = FALSE) +
  geom_text_repel(data = subset(mov21, imdb_stars < 6)) +
  geom_label_repel(data = mov21 %>%
                     slice_max(imdb_ratings, n = 5)) +
  labs(x = "Number of IMDB Ratings",
       y = "Weighted Average IMDB Star Rating",
       title = "Do films rated more often have higher ratings?",
       subtitle = glue("Pearson correlation of `imdb_stars` and `imdb_ratings` is {temp_cor5}."),
       ## The film count is taken from the data so the caption cannot
       ## drift from the sheet; the student count is not a column of
       ## mov21 and so stays as typed.
       caption = glue("{nrow(mov21)} films mentioned as favorites by 431 students in 2020 and in 2021"))

Which combinations of categories were most common?

## Horizontal bar chart: one bar per distinct imdb_categories string.
mov21 %>%
  ggplot(aes(x = factor(imdb_categories))) +
  geom_bar() +
  coord_flip()

## The same frequencies as a table, largest count first.
mov21 %>%
  count(imdb_categories, sort = TRUE)
## Split each imdb_categories string on ", " into a three-column
## character matrix, then name the columns.
temp <- mov21 %>%
  pull(imdb_categories) %>%
  str_split_fixed(pattern = ", ", n = 3)
colnames(temp) <- c("cat1", "cat2", "cat3")

## Tibble version of the matrix, printed for inspection.
temp_dat <- temp %>% as_tibble()

temp_dat

## Frequency table for each of the three category positions.
tabyl(temp_dat, cat1)
tabyl(temp_dat, cat2)
tabyl(temp_dat, cat3)
## Add one cat_* column per category, holding the number of times that
## category's name occurs in the film's imdb_categories string.
mov21 <- mov21 %>%
  mutate(
    cat_Action = str_count(imdb_categories, "Action"),
    cat_Adventure = str_count(imdb_categories, "Adventure"),
    cat_Animation = str_count(imdb_categories, "Animation"),
    cat_Biography = str_count(imdb_categories, "Biography"),
    cat_Comedy = str_count(imdb_categories, "Comedy"),
    cat_Crime = str_count(imdb_categories, "Crime"),
    cat_Drama = str_count(imdb_categories, "Drama"),
    cat_Family = str_count(imdb_categories, "Family"),
    cat_Fantasy = str_count(imdb_categories, "Fantasy"),
    cat_History = str_count(imdb_categories, "History"),
    cat_Horror = str_count(imdb_categories, "Horror"),
    ## "Music" is a prefix of "Musical", so an unanchored pattern would
    ## count every Musical as Music too. The word boundaries restrict
    ## the match to the standalone "Music" category.
    cat_Music = str_count(imdb_categories, "\\bMusic\\b"),
    cat_Musical = str_count(imdb_categories, "Musical"),
    cat_Mystery = str_count(imdb_categories, "Mystery"),
    cat_Romance = str_count(imdb_categories, "Romance"),
    cat_Sci_Fi = str_count(imdb_categories, "Sci-Fi"),
    cat_Sport = str_count(imdb_categories, "Sport"),
    cat_Thriller = str_count(imdb_categories, "Thriller"),
    cat_War = str_count(imdb_categories, "War"),
    cat_Western = str_count(imdb_categories, "Western")
  )
## Column sums of every cat_* column, shown as a single wide row.
mov21 %>%
  summarise(across(starts_with("cat_"), sum))

## The same sums reshaped to one row per category, largest count first.
mov21 %>%
  summarise(across(starts_with("cat_"), sum)) %>%
  pivot_longer(
    everything(),
    names_to = "category",
    values_to = "count"
  ) %>%
  arrange(desc(count))
## Star ratings split by the cat_Drama value: a filled violin with a
## narrow notched boxplot drawn on top, and the fill legend hidden.
mov21 %>%
  ggplot(aes(x = factor(cat_Drama), y = imdb_stars)) +
  geom_violin(aes(fill = factor(cat_Drama))) +
  geom_boxplot(notch = TRUE, width = 0.3) +
  labs(title = "Drama favorites had slightly higher IMDB star ratings than non-Dramas") +
  guides(fill = "none")

## Star ratings split by the cat_Comedy value: a filled violin with a
## narrow notched boxplot drawn on top, and the fill legend hidden.
mov21 %>%
  ggplot(aes(x = factor(cat_Comedy), y = imdb_stars)) +
  geom_violin(aes(fill = factor(cat_Comedy))) +
  geom_boxplot(notch = TRUE, width = 0.3) +
  labs(title = "Comedy favorites had lower IMDB star ratings") +
  guides(fill = "none")