Note: This document is published to https://rpubs.com/TELOVE/movies-eda-1

Setup

## Normally, I would not include this chunk of code
## in a final report

library(knitr)
library(rmdformats)

## Global options
## max.print is documented as an integer option, so store a number rather
## than the string "75"; anything that compares a length against
## getOption("max.print") would otherwise compare as character.
options(max.print = 75)
## comment = NA: set the chunk option that controls the prefix on printed output
opts_chunk$set(comment = NA)
opts_knit$set(width = 75)

library(googlesheets4) # loading in data from Google Sheet
library(glue)  # combining text and R results
library(ggrepel) # for geom_text_repel and geom_label_repel
library(janitor)
library(knitr)
library(magrittr)
library(naniar)
library(tidyverse)

# Black-and-white ggplot2 theme for every plot that follows.
theme_set(theme_bw())

# Read the Google Sheet and keep only the seven columns used in this report.
mov21 <-
  "https://docs.google.com/spreadsheets/d/1GHdGOZ7k20S--v6xewnyLx5vNJLDTtsok-R4AU1hvYY/edit#gid=1563354494" %>%
  read_sheet() %>%
  select(
    film_id, film, year, length,
    imdb_categories, imdb_ratings, imdb_stars
  )

# Print the data.
mov21

# Per-variable missingness summary (naniar).
mov21 %>% miss_var_summary()

Are more recent movies more highly rated?

# Pearson correlation of release year with IMDB stars, rounded to three
# decimals for use in the plot subtitle.
temp_cor <- round_half_up(
  with(mov21, cor(year, imdb_stars)),
  digits = 3
)

# Scatterplot of stars against year with a loess smooth (blue) and a
# straight-line fit (red). Films picked by slice_max / slice_min on
# imdb_stars (n = 2 each) are labelled by name.
mov21 %>%
  ggplot(aes(x = year, y = imdb_stars, label = film)) +
  geom_point() +
  geom_smooth(
    method = "loess", formula = y ~ x,
    se = FALSE, colour = "blue"
  ) +
  geom_smooth(
    method = "lm", formula = y ~ x,
    se = FALSE, colour = "red"
  ) +
  geom_text_repel(data = slice_max(mov21, imdb_stars, n = 2)) +
  geom_text_repel(data = slice_min(mov21, imdb_stars, n = 2)) +
  labs(
    title = "Are newer films more highly rated?",
    subtitle = glue("Pearson correlation of `imdb_stars` and `year` is {temp_cor}."),
    caption = "115 films mentioned as favorites by 431 students in 2020 and in 2021",
    x = "Year of Release",
    y = "Weighted Average IMDB Star Rating"
  )

What if we restrict to films since 1980?

# Keep only the films whose release year is after 1979.
mov21_recent <- filter(mov21, year > 1979)

# Correlation of year and stars within that subset, rounded to three
# decimals for the subtitle.
temp_cor2 <- round_half_up(
  with(mov21_recent, cor(year, imdb_stars)),
  digits = 3
)

# Same scatterplot as for the full data, restricted to the recent films.
# The highest- and lowest-rated films in the subset get boxed labels.
mov21_recent %>%
  ggplot(aes(x = year, y = imdb_stars, label = film)) +
  geom_point() +
  geom_smooth(
    method = "loess", formula = y ~ x,
    se = FALSE, colour = "blue"
  ) +
  geom_smooth(
    method = "lm", formula = y ~ x,
    se = FALSE, colour = "red"
  ) +
  geom_label_repel(data = slice_max(mov21_recent, imdb_stars)) +
  geom_label_repel(data = slice_min(mov21_recent, imdb_stars)) +
  labs(
    title = "Among films since 1980, are newer films more highly rated?",
    subtitle = glue("Correlation is {temp_cor2} in films since 1980."),
    caption = glue("{nrow(mov21_recent)} favorite films released since 1980"),
    x = "Year of Release",
    y = "Weighted Average IMDB Star Rating"
  )

Are more recent movies longer?

# Pearson correlation of release year with running length, rounded to
# three decimals for the subtitle.
temp_cor3 <- round_half_up(
  with(mov21, cor(year, length)),
  digits = 3
)

# Length against year with loess (blue) and linear (red) fits; the
# earliest film(s) in the data get a boxed label.
mov21 %>%
  ggplot(aes(x = year, y = length, label = film)) +
  geom_point() +
  geom_smooth(
    method = "loess", formula = y ~ x,
    se = FALSE, colour = "blue"
  ) +
  geom_smooth(
    method = "lm", formula = y ~ x,
    se = FALSE, colour = "red"
  ) +
  geom_label_repel(data = slice_min(mov21, year)) +
  labs(
    title = "Are more recent movies longer?",
    subtitle = glue("Pearson correlation of `year` and `length` is {temp_cor3}."),
    caption = "115 films mentioned as favorites by 431 students in 2020 and in 2021",
    x = "Year of Release",
    y = "Length of Movie (in minutes)"
  )

Do longer films have higher ratings?

# Pearson correlation of running length with IMDB stars, rounded to
# three decimals for the subtitle.
temp_cor4 <- round_half_up(
  with(mov21, cor(length, imdb_stars)),
  digits = 3
)

# Stars against length with loess (blue) and linear (red) fits; the
# longest films (slice_max on length, n = 2) get boxed labels.
mov21 %>%
  ggplot(aes(x = length, y = imdb_stars, label = film)) +
  geom_point() +
  geom_smooth(
    method = "loess", formula = y ~ x,
    se = FALSE, colour = "blue"
  ) +
  geom_smooth(
    method = "lm", formula = y ~ x,
    se = FALSE, colour = "red"
  ) +
  geom_label_repel(data = slice_max(mov21, length, n = 2)) +
  labs(
    title = "Do longer films have higher ratings?",
    subtitle = glue("Pearson correlation of `imdb_stars` and `length` is {temp_cor4}."),
    caption = "115 films mentioned as favorites by 431 students in 2020 and in 2021",
    x = "Length of Movie (in minutes)",
    y = "Weighted Average IMDB Star Rating"
  )

# Same length-vs-stars correlation as for the previous plot, recomputed
# here and rounded to three decimals.
temp_cor4 <- round_half_up(
  with(mov21, cor(length, imdb_stars)),
  digits = 3
)

# Same scatterplot of stars against length, but this time the films
# with fewer than 6 stars are the ones labelled.
mov21 %>%
  ggplot(aes(x = length, y = imdb_stars, label = film)) +
  geom_point() +
  geom_smooth(
    method = "loess", formula = y ~ x,
    se = FALSE, colour = "blue"
  ) +
  geom_smooth(
    method = "lm", formula = y ~ x,
    se = FALSE, colour = "red"
  ) +
  geom_text_repel(data = filter(mov21, imdb_stars < 6)) +
  labs(
    title = "Do longer films have higher ratings?",
    subtitle = glue("Pearson correlation of `imdb_stars` and `length` is {temp_cor4}."),
    caption = "115 films mentioned as favorites by 431 students in 2020 and in 2021",
    x = "Length of Movie (in minutes)",
    y = "Weighted Average IMDB Star Rating"
  )

Are the “stars” and number of ratings strongly associated?

# Pearson correlation of the number of IMDB ratings with IMDB stars,
# rounded to three decimals for the subtitle.
temp_cor5 <- round_half_up(
  with(mov21, cor(imdb_ratings, imdb_stars)),
  digits = 3
)

# Stars against number of ratings with loess (blue) and linear (red)
# fits. Films under 6 stars get plain text labels; the most-rated films
# (slice_max on imdb_ratings, n = 5) get boxed labels.
mov21 %>%
  ggplot(aes(x = imdb_ratings, y = imdb_stars, label = film)) +
  geom_point() +
  geom_smooth(
    method = "loess", formula = y ~ x,
    se = FALSE, colour = "blue"
  ) +
  geom_smooth(
    method = "lm", formula = y ~ x,
    se = FALSE, colour = "red"
  ) +
  geom_text_repel(data = filter(mov21, imdb_stars < 6)) +
  geom_label_repel(data = slice_max(mov21, imdb_ratings, n = 5)) +
  labs(
    title = "Do films rated more often have higher ratings?",
    subtitle = glue("Pearson correlation of `imdb_stars` and `imdb_ratings` is {temp_cor5}."),
    caption = "115 films mentioned as favorites by 431 students in 2020 and in 2021",
    x = "Number of IMDB Ratings",
    y = "Weighted Average IMDB Star Rating"
  )

Which combinations of categories were most common?

# Horizontal bar chart: one bar per distinct imdb_categories string.
mov21 %>%
  ggplot(aes(x = factor(imdb_categories))) +
  geom_bar() +
  coord_flip()

# The same counts as a table, most common combination first.
count(mov21, imdb_categories, sort = TRUE)

# Split each imdb_categories string on ", " into exactly three pieces,
# giving a character matrix with one row per film.
temp <- mov21 %>%
  pull(imdb_categories) %>%
  str_split_fixed(pattern = ", ", n = 3)
dimnames(temp) <- list(NULL, c("cat1", "cat2", "cat3"))

temp_dat <- as_tibble(temp)

temp_dat

# Frequency table for each of the three category positions.
tabyl(temp_dat, cat1)

tabyl(temp_dat, cat2)

tabyl(temp_dat, cat3)

# Add one cat_* column per IMDB category, holding the number of times
# that category name occurs in the film's imdb_categories string.
mov21 <- mov21 %>%
  mutate(
    cat_Action = str_count(imdb_categories, "Action"),
    cat_Adventure = str_count(imdb_categories, "Adventure"),
    cat_Animation = str_count(imdb_categories, "Animation"),
    cat_Biography = str_count(imdb_categories, "Biography"),
    cat_Comedy = str_count(imdb_categories, "Comedy"),
    cat_Crime = str_count(imdb_categories, "Crime"),
    cat_Drama = str_count(imdb_categories, "Drama"),
    cat_Family = str_count(imdb_categories, "Family"),
    cat_Fantasy = str_count(imdb_categories, "Fantasy"),
    cat_History = str_count(imdb_categories, "History"),
    cat_Horror = str_count(imdb_categories, "Horror"),
    # "Music" is a prefix of "Musical", so a plain pattern would also
    # count every Musical as Music. The word boundaries restrict the
    # match to the stand-alone category name.
    cat_Music = str_count(imdb_categories, "\\bMusic\\b"),
    cat_Musical = str_count(imdb_categories, "Musical"),
    cat_Mystery = str_count(imdb_categories, "Mystery"),
    cat_Romance = str_count(imdb_categories, "Romance"),
    cat_Sci_Fi = str_count(imdb_categories, "Sci-Fi"),
    cat_Sport = str_count(imdb_categories, "Sport"),
    cat_Thriller = str_count(imdb_categories, "Thriller"),
    cat_War = str_count(imdb_categories, "War"),
    cat_Western = str_count(imdb_categories, "Western")
)

# Column totals of every cat_* column, as a one-row wide table.
summarise(mov21, across(starts_with("cat_"), sum))

# The same totals reshaped to one row per category and sorted so the
# largest total comes first.
mov21 %>%
  summarise(across(starts_with("cat_"), sum)) %>%
  pivot_longer(
    everything(),
    names_to = "category",
    values_to = "count"
  ) %>%
  arrange(desc(count))

# IMDB stars split by the cat_Drama value: violin (filled by group) with
# a narrow notched boxplot drawn on top; the fill legend is dropped.
mov21 %>%
  ggplot(aes(x = factor(cat_Drama), y = imdb_stars)) +
  geom_violin(aes(fill = factor(cat_Drama))) +
  geom_boxplot(notch = TRUE, width = 0.3) +
  guides(fill = "none") +
  labs(
    title = "Drama favorites had slightly higher IMDB star ratings than non-Dramas"
  )

# Same display, split by the cat_Comedy value.
mov21 %>%
  ggplot(aes(x = factor(cat_Comedy), y = imdb_stars)) +
  geom_violin(aes(fill = factor(cat_Comedy))) +
  geom_boxplot(notch = TRUE, width = 0.3) +
  guides(fill = "none") +
  labs(
    title = "Comedy favorites had lower IMDB star ratings"
  )

Exploring your favorite movies

Thomas E. Love

2021-09-21

Setup

Are more recent movies more highly rated?

What if we restrict to films since 1980?

Are more recent movies longer?

Do longer films have higher ratings?

Are the “stars” and number of ratings strongly associated?

Which combinations of categories were most common?