We’ll start by loading the data
library(tidyverse)
library(magrittr)
t1 <- "https://github.com/thomasjwood/code_lab/raw/main/data/box_office_jan_97_feb_24.rds" %>%
url %>%
readRDS
As we discussed in lab, we need to compare films released by year, but include the film days that occur in subsequent years (ie, we need to include those days of release which fall after the new year.)
So we’ll build a new variable, rel_year
, which indicates
the year of release for each film, rather than the specific year which
relates to a specific row
t1 %<>%
left_join(
t1 %>%
arrange(
id, title, date
) %>%
group_by(
id, title
) %>%
slice(1) %>%
mutate(
rel_year = date %>% year
) %>%
select(
id, title, rel_year
)
)
Now we’ll calculate the highest grossing film by
rel_year
t2 <- t1 %>%
filter(
rel_year %>%
is_in(
2018:2023
)
) %>%
group_by(
rel_year, id, title,
) %>%
summarize(
tote = daily_adj %>% sum
) %>%
arrange(desc(tote)) %>%
group_by(rel_year) %>%
slice(1) %>%
mutate(kr = "keep") %>%
right_join(
t1
) %>%
filter(
kr %>% equals("keep")
)
t2 %>%
ggplot(
aes(
day, to_date_adj, color = title
)
) +
geom_step() +
geom_text(
aes(
label = title
),
size = 3,
hjust = 0,
data = t2 %>%
group_by(title) %>%
arrange(title, day) %>%
slice(n()),
position = position_nudge(x = 4)
) +
theme_minimal() +
theme(
legend.position = "none"
) +
scale_x_continuous(
expand = expansion(add = c(10, 70))
) +
scale_y_continuous(
labels = scales::dollar_format()
) +
labs(
x = "Days in Release",
y = "Inlation adjusted domestic box office",
title = "Cumulative box office for highest grossing movies by year, 2018-2023",
)
lubridate::yday()
might be useful.)So before we make the graph, we’re going to need a table where every row looks like this:
year | day_of_year | tote_box | cume_box |
---|---|---|---|
2018 | 1 | 65 | 65 |
2018 | 2 | 55 | 120 |
2018 | 3 | 30 | 150 |
2018 | 4 | 40 | 190 |
… | … | … | … |
2019 | 1 | 65 | 65 |
t3 <- t1 %>%
mutate(
year = date %>% year,
yday = date %>% yday
) %>%
filter(
year %>%
is_in(
2018:2023
)
) %>%
group_by(
year, yday
) %>%
summarize(tote_box = daily_adj %>% sum) %>%
mutate(
cume_box = tote_box %>% cumsum
)
Which should return
## # A tibble: 2,120 × 4
## # Groups: year [6]
## year yday tote_box cume_box
## <dbl> <dbl> <dbl> <dbl>
## 1 2018 1 68210765. 68210765.
## 2 2018 2 42445248. 110656013.
## 3 2018 3 29659573. 140315586.
## 4 2018 4 25314846. 165630432.
## 5 2018 5 60657252. 226287684.
## 6 2018 6 79558766. 305846450.
## 7 2018 7 50179008. 356025458.
## 8 2018 8 15049315. 371074773.
## 9 2018 9 20121874. 391196647.
## 10 2018 10 14362471. 405559119.
## # ℹ 2,110 more rows
This looks like we’re ready to plot!
t3 %>%
ggplot(
aes(
x = yday,
y = cume_box,
color = year %>% factor
)
) +
geom_step() +
geom_text(
aes(
label = year
),
hjust = 0,
data = t3 %>%
arrange(year, yday) %>%
slice(n()),
position = position_nudge(5)
) +
scale_x_continuous(
expand = expansion(add = c(10, 30)),
breaks = seq(0, 350, by = 50)
) +
scale_y_continuous(
breaks = c(
0, 5e9, 10e9
),
labels = c(
"0", "$5bn", "$10bn"
)
) +
labs(
title = "Cumulative annual box office, 2018-2023",
x = "Day of Year",
y = "Cumulative industry-wide box office"
) +
theme_minimal() +
theme(
legend.position = "none"
)
First we’ll need to recode the studio indicator
t4 <- t1 %>%
ungroup %>%
select(
date, daily_adj, studio
) %>%
mutate(
studio2 = studio %>%
case_match(
"Warner Bros." ~ "Warner Bros.",
"Universal Pictures" ~ "Universal Pictures",
c("Twentieth Century Fox",
"Fox Searchlight",
"Walt Disney Studios Motion Pictures") ~ "Disney",
"Paramount Pictures" ~ "Paramount Pictures",
"Sony Pictures Releasing" ~ "Sony",
.default = "Other"
),
year = date %>% year
) %>%
filter(
year %>%
is_in(2018:2023)
) %>%
group_by(year, studio2) %>%
summarize(
tote = daily_adj %>% sum
) %>%
mutate(
perc = tote %>%
divide_by(
tote %>% sum
) %>%
multiply_by(100)
) %>%
ungroup %>%
mutate(
studio2 = studio2 %>%
fct_reorder(perc, .desc = T) %>%
fct_relevel("Other", after = Inf)
)
t4 %>%
ggplot(
aes(
x = year %>% factor,
y = perc,
fill = studio2
)
) +
geom_col(
position = position_stack(),
color = "grey10"
) +
geom_text(
aes(
label = perc %>% round,
group = studio2
),
position = position_stack(vjust = .5),
data = t4 %>%
filter(
perc > 5
)
) +
scale_y_continuous(breaks = NULL) +
scale_fill_brewer(palette = "RdBu") +
labs(
x = "",
y = "",
fill = "",
title = "Studios' share of the domestic box office, 2018:2023"
) +
theme_minimal() +
theme(
panel.grid = element_blank()
)