Hollywood Age Gaps and Bechdel Tests

Question

Is there a correlation between age gaps and Bechdel ratings?
For this purpose: gap = man’s age - woman’s age

Set working directory ke folder lokal kamu

setwd("D:/ULM/Mata Kuliah/Komputasi Statistika I/Pertemuan 14/Data analysis - age gaps and bechdel tests")

Data Load

age_gaps <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2023/2023-02-14/age_gaps.csv')
## Rows: 1155 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (6): movie_name, director, actor_1_name, actor_2_name, character_1_gend...
## dbl  (5): release_year, age_difference, couple_number, actor_1_age, actor_2_age
## date (2): actor_1_birthdate, actor_2_birthdate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
bechdel <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-03-09/raw_bechdel.csv')
## Rows: 8839 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): imdb_id, title
## dbl (3): year, id, rating
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Orientation

ggplot(bechdel, aes(x = rating)) + geom_bar()

ggplot(age_gaps, aes(x = age_difference)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Data Preparation

age_gaps <- age_gaps %>% 
  mutate(man_older = case_when(
    character_1_gender == "man" & character_2_gender == "woman" ~ actor_1_age - actor_2_age,
    character_2_gender == "man" & character_1_gender == "woman" ~ actor_2_age - actor_1_age
  ))

ggplot(age_gaps, aes(x = man_older)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 23 rows containing non-finite outside the scale range
## (`stat_bin()`).

mean(age_gaps$man_older, na.rm = TRUE)
## [1] 8.568905

Join the Data

movies <- age_gaps %>% 
  left_join(bechdel, by = c("movie_name" = "title", "release_year" = "year"))
## Warning in left_join(., bechdel, by = c(movie_name = "title", release_year = "year")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 810 of `x` matches multiple rows in `y`.
## ℹ Row 1933 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
# Cek duplikat Wonder Woman
bechdel %>% filter(title == "Wonder Woman")
## # A tibble: 4 × 5
##    year    id imdb_id title        rating
##   <dbl> <dbl> <chr>   <chr>         <dbl>
## 1  2009  3585 1186373 Wonder Woman      3
## 2  2017  7241 0451279 Wonder Woman      3
## 3  2017  9294 <NA>    Wonder Woman      3
## 4  2017  9293 <NA>    Wonder Woman      3
# Hapus duplikat Wonder Woman
movies <- movies %>% filter(!(id %in% c(9293, 9294)))

# Cek film lain dengan data duplikat
bechdel %>% count(title, year) %>% filter(n > 1)
## # A tibble: 8 × 3
##   title                   year     n
##   <chr>                  <dbl> <int>
## 1 Ayneh                   1997     2
## 2 Dracula                 1931     2
## 3 Into the Woods          2014     2
## 4 Last Call at the Oasis  2011     2
## 5 Sleeping Beauty         1959     2
## 6 Terms of Endearment     1983     2
## 7 The Danish Girl         2015     2
## 8 Wonder Woman            2017     3

Visualization: Age Gap vs Bechdel Rating

movies <- movies %>% 
  mutate(rating_cat = as.factor(rating)) %>% 
  drop_na(man_older, rating)

ggplot(movies, aes(x = rating_cat, y = man_older)) + 
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(alpha = .2) + 
  scale_x_discrete(labels = c("Fail", "No conversation", "Only about men", "Pass")) +
  labs(x = "Bechdel test result", y = "Male age gap", title = "Something to talk about")

Correlation

cor(movies$man_older, movies$rating)
## [1] -0.1453132
cor(movies$man_older, movies$rating, method = "spearman")
## [1] -0.1508434

Regression Testing

model <- lm(man_older ~ rating_cat, data = movies)
summary(model)
## 
## Call:
## lm(formula = man_older ~ rating_cat, data = movies)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -59.018  -6.216  -0.809   5.982  40.183 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    9.241      1.869   4.946 9.33e-07 ***
## rating_cat1    1.567      2.011   0.779    0.436    
## rating_cat2   -0.424      2.091  -0.203    0.839    
## rating_cat3   -2.223      1.929  -1.153    0.249    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.06 on 768 degrees of freedom
## Multiple R-squared:  0.02433,    Adjusted R-squared:  0.02051 
## F-statistic: 6.383 on 3 and 768 DF,  p-value: 0.0002833