Answers to Graphics Lab exercises

1. Take the top grossing film released in every year between 2018 and 2023. For each film, plot the cumulative adjusted box office against days in release.

We’ll start by loading the data

library(tidyverse)
library(magrittr)

t1 <- "https://github.com/thomasjwood/code_lab/raw/main/data/box_office_jan_97_feb_24.rds" %>% 
  url %>% 
  readRDS

As we discussed in lab, we need to compare films released by year, but include the film days that occur in subsequent years (ie, we need to include those days of release which fall after the new year.)

So we’ll build a new variable, rel_year, which indicates the year of release for each film, rather than the specific year which relates to a specific row

t1 %<>% 
  left_join(
    t1 %>% 
      arrange(
        id, title, date
      ) %>% 
      group_by(
        id, title
      ) %>% 
      slice(1) %>% 
      mutate(
        rel_year = date %>% year
      ) %>% 
      select(
        id, title, rel_year
      )
  )

Now we’ll calculate the highest grossing film by rel_year

t2 <- t1  %>%  
  filter(
    rel_year %>% 
      is_in(
        2018:2023
      )
  ) %>% 
  group_by(
    rel_year, id, title, 
  ) %>% 
  summarize(
    tote = daily_adj %>% sum
  ) %>% 
  arrange(desc(tote)) %>% 
  group_by(rel_year) %>% 
  slice(1) %>% 
  mutate(kr = "keep") %>% 
  right_join(
    t1
  ) %>% 
  filter(
    kr %>% equals("keep")
  ) 

t2 %>% 
  ggplot(
    aes(
      day, to_date_adj, color = title
    )
  ) +
  geom_step() +
  geom_text(
    aes(
      label = title
    ),
    size = 3, 
    hjust = 0,
    data = t2 %>% 
      group_by(title) %>% 
      arrange(title, day) %>% 
      slice(n()),
    position = position_nudge(x = 4)
  ) +
  theme_minimal() +
  theme(
    legend.position = "none"
  ) +
  scale_x_continuous(
    expand = expansion(add = c(10, 70))
  ) +
  scale_y_continuous(
    labels = scales::dollar_format()
  ) +
  labs(
    x = "Days in Release",
    y = "Inlation adjusted domestic box office",
    title = "Cumulative box office for highest grossing movies by year, 2018-2023",
    
  )

2. Plot the cumulative annual box office for every year between 2018 and 2023.(Hint: the function lubridate::yday() might be useful.)

So before we make the graph, we’re going to need a table where every row looks like this:

year day_of_year tote_box cume_box
2018 1 65 65
2018 2 55 120
2018 3 30 150
2018 4 40 190
2019 1 65 65
t3 <- t1 %>% 
  mutate(
    year = date %>% year, 
    yday = date %>% yday
  ) %>% 
  filter(
    year %>% 
      is_in(
        2018:2023
      )
  ) %>% 
  group_by(
    year, yday
  ) %>% 
  summarize(tote_box = daily_adj %>% sum) %>% 
  mutate(
    cume_box = tote_box %>% cumsum
  )

Which should return

## # A tibble: 2,120 × 4
## # Groups:   year [6]
##     year  yday  tote_box   cume_box
##    <dbl> <dbl>     <dbl>      <dbl>
##  1  2018     1 68210765.  68210765.
##  2  2018     2 42445248. 110656013.
##  3  2018     3 29659573. 140315586.
##  4  2018     4 25314846. 165630432.
##  5  2018     5 60657252. 226287684.
##  6  2018     6 79558766. 305846450.
##  7  2018     7 50179008. 356025458.
##  8  2018     8 15049315. 371074773.
##  9  2018     9 20121874. 391196647.
## 10  2018    10 14362471. 405559119.
## # ℹ 2,110 more rows

This looks like we’re ready to plot!

t3 %>% 
  ggplot(
    aes(
      x = yday,
      y =  cume_box, 
      color = year %>% factor
    )
  ) +
  geom_step() +
  geom_text(
    aes(
      label = year
    ),
    hjust = 0,
    data = t3 %>% 
      arrange(year, yday) %>% 
      slice(n()),
    position = position_nudge(5)
  ) +
  scale_x_continuous(
    expand = expansion(add = c(10, 30)),
    breaks = seq(0, 350, by = 50)
    ) +
  scale_y_continuous(
    breaks = c(
      0, 5e9, 10e9
    ),
    labels = c(
      "0", "$5bn", "$10bn"
    )
  ) +
  labs(
    title = "Cumulative annual box office, 2018-2023",
    x = "Day of Year",
    y = "Cumulative industry-wide box office"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none"
  )

3. Take the big 5 studios (Disney, Universal, Paramount, Warner Bros., Sony) and report their annual studio box office as a percentage of total receipts for the years 2018 though 2023. Group together all other studios in an “Other” category. Plot these percentages as a stacked bar, by year.

First we’ll need to recode the studio indicator

t4 <- t1 %>% 
  ungroup %>%
  select(
    date, daily_adj, studio
    ) %>% 
  mutate(
    studio2 = studio %>% 
      case_match(
        "Warner Bros." ~ "Warner Bros.",
        "Universal Pictures" ~ "Universal Pictures",
        c("Twentieth Century Fox",
          "Fox Searchlight",
          "Walt Disney Studios Motion Pictures") ~ "Disney",
        "Paramount Pictures" ~ "Paramount Pictures",
        "Sony Pictures Releasing" ~ "Sony", 
        .default = "Other"
        ),
    year = date %>% year
    ) %>% 
  filter(
    year %>% 
      is_in(2018:2023)
  ) %>% 
  group_by(year, studio2) %>% 
  summarize(
    tote = daily_adj %>% sum
    ) %>% 
  mutate(
    perc = tote %>% 
      divide_by(
        tote %>% sum
        ) %>% 
      multiply_by(100)
    ) %>% 
  ungroup %>% 
  mutate(
    studio2 = studio2 %>% 
      fct_reorder(perc, .desc = T) %>% 
      fct_relevel("Other", after = Inf)
    )

t4 %>% 
  ggplot(
    aes(
      x = year %>% factor,
      y = perc,
      fill = studio2
    )
  ) +
  geom_col(
    position = position_stack(),
    color = "grey10"
  ) +
  geom_text(
    aes(
      label = perc %>% round,
      group = studio2
      ),
    position = position_stack(vjust = .5),
    data = t4 %>% 
      filter(
        perc > 5
      )
   ) +
  scale_y_continuous(breaks = NULL) +
  scale_fill_brewer(palette = "RdBu") +
  labs(
    x = "",
    y = "",
    fill = "",
    title = "Studios' share of the domestic box office, 2018:2023"
  ) +
  theme_minimal() +
  theme(
    panel.grid = element_blank()
  )