# Data team: powers activate! Any other cool analysis here? @drob @kevinmontrose @fody @JasonPunyon
#
# — CAseyA (@cashnwho) December 13, 2016
library(tidyverse)
library(scales)
# Make the black-and-white ggplot2 theme the default for plots drawn below.
ggplot2::theme_set(ggplot2::theme_bw())

# Character-level table; the analysis below reads its `script_id`, `gender`
# and `words` columns (presumably one row per character -- confirm).
characters <- readr::read_csv(
  file = "https://raw.githubusercontent.com/matthewfdaniels/scripts/graphs/character_list5.csv"
)
# Film-level metadata; the analysis below joins it on `script_id` and reads
# its `year` column.
movies <- readr::read_csv(
  file = "https://raw.githubusercontent.com/matthewfdaniels/scripts/graphs/meta_data7.csv"
)

# Plot, per year, the average share of dialogue spoken by women.
#
# Steps:
#   1. Sum `words` per film (`script_id`) and `gender`, skipping characters
#      whose word count is missing.
#   2. Widen to one row per film so the `f` and `m` totals sit side by side;
#      a film with no rows for a gender gets 0 for it. Any other gender codes
#      become extra columns that do not enter the ratio.
#   3. Compute each film's female share f / (m + f), join the film metadata to
#      obtain `year`, and average the per-film shares within each year.
#   4. Keep only years with at least 10 films and draw the trend as a line.
characters %>%
  filter(!is.na(words)) %>%
  count(script_id, gender, wt = words) %>%
  # pivot_wider() replaces the superseded spread(); `values_fill = 0` plays
  # the role of spread()'s `fill = 0`.
  pivot_wider(names_from = gender, values_from = n, values_fill = 0) %>%
  # A film with no words attributed to `m` or `f` would yield 0 / 0 = NaN and
  # turn its whole year's mean into NaN, so exclude it before the ratio.
  filter(m + f > 0) %>%
  mutate(percent_female = f / (m + f)) %>%
  inner_join(movies, by = "script_id") %>%
  group_by(year) %>%
  summarize(average = mean(percent_female),
            total_movies = n_distinct(script_id)) %>%
  # Years with few films give noisy averages; require at least 10.
  filter(total_movies >= 10) %>%
  ggplot(aes(year, average)) +
  geom_line() +
  scale_y_continuous(labels = percent_format()) +
  # Anchor the y axis at 0 so the share is not visually exaggerated.
  expand_limits(y = 0) +
  ylab("Average % of dialogue spoken by women") +
  ggtitle("% of dialogue spoken by women over time",
          subtitle = "Among years with 10+ movies in data")