Data team: powers activate! Any other cool analysis here? @drob @kevinmontrose @fody @JasonPunyon
— CAseyA (@cashnwho) December 13, 2016
library(tidyverse)
library(scales)
# Make the black-and-white theme the default for every ggplot drawn afterwards.
theme_set(theme_bw())

# Per-script character data; the analysis below relies on its `script_id`,
# `gender` and `words` columns (presumably one row per character -- TODO
# confirm against the CSV).
characters <- read_csv(
  file = "https://raw.githubusercontent.com/matthewfdaniels/scripts/graphs/character_list5.csv"
)

# Script-level metadata; joined to `characters` on `script_id` to supply `year`.
movies <- read_csv(
  file = "https://raw.githubusercontent.com/matthewfdaniels/scripts/graphs/meta_data7.csv"
)
# Share of dialogue spoken by women, averaged per release year and plotted.
#
# Steps:
#   1. Sum `words` per script and gender (rows with missing `words` dropped).
#   2. Reshape to one row per script with one column per gender value.
#   3. Compute each script's female share of the male + female word total.
#   4. Attach each script's `year` from `movies` and average the shares by year.
#   5. Plot the yearly average for years that have at least 10 scripts.
characters %>%
  filter(!is.na(words)) %>%
  count(script_id, gender, wt = words) %>%
  # pivot_wider() replaces the superseded spread(); a gender absent from a
  # script gets a word total of 0 instead of NA.
  pivot_wider(
    names_from = gender,
    values_from = n,
    values_fill = list(n = 0)
  ) %>%
  # A script with no male or female words would give 0 / 0 = NaN, and a single
  # NaN makes the whole year's mean NaN, so such scripts are dropped.
  filter(m + f > 0) %>%
  mutate(percent_female = f / (m + f)) %>%
  # Join against distinct script/year pairs so that a repeated `script_id` row
  # in `movies` cannot weight a script more than once in the yearly mean.
  inner_join(distinct(movies, script_id, year), by = "script_id") %>%
  group_by(year) %>%
  summarize(
    average = mean(percent_female),
    total_movies = n_distinct(script_id)
  ) %>%
  # Only plot years backed by at least 10 scripts.
  filter(total_movies >= 10) %>%
  ggplot(aes(year, average)) +
  geom_line() +
  scale_y_continuous(labels = percent_format()) +
  # Always include 0 on the y axis.
  expand_limits(y = 0) +
  ylab("Average % of dialogue spoken by women") +
  ggtitle(
    "% of dialogue spoken by women over time",
    subtitle = "Among years with 10+ movies in data"
  )