Is there a correlation between age gaps and Bechdel ratings?
For this purpose: gap = man’s age - woman’s age
setwd("D:/ULM/Mata Kuliah/Komputasi Statistika I/Pertemuan 14/Data analysis - age gaps and bechdel tests")
age_gaps <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2023/2023-02-14/age_gaps.csv')
## Rows: 1155 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): movie_name, director, actor_1_name, actor_2_name, character_1_gend...
## dbl (5): release_year, age_difference, couple_number, actor_1_age, actor_2_age
## date (2): actor_1_birthdate, actor_2_birthdate
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
bechdel <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-03-09/raw_bechdel.csv')
## Rows: 8839 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): imdb_id, title
## dbl (3): year, id, rating
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ggplot(bechdel, aes(x = rating)) + geom_bar()
ggplot(age_gaps, aes(x = age_difference)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
age_gaps <- age_gaps %>%
mutate(man_older = case_when(
character_1_gender == "man" & character_2_gender == "woman" ~ actor_1_age - actor_2_age,
character_2_gender == "man" & character_1_gender == "woman" ~ actor_2_age - actor_1_age
))
ggplot(age_gaps, aes(x = man_older)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 23 rows containing non-finite outside the scale range
## (`stat_bin()`).
mean(age_gaps$man_older, na.rm = TRUE)
## [1] 8.568905
movies <- age_gaps %>%
left_join(bechdel, by = c("movie_name" = "title", "release_year" = "year"))
## Warning in left_join(., bechdel, by = c(movie_name = "title", release_year = "year")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 810 of `x` matches multiple rows in `y`.
## ℹ Row 1933 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# Cek duplikat Wonder Woman
bechdel %>% filter(title == "Wonder Woman")
## # A tibble: 4 × 5
## year id imdb_id title rating
## <dbl> <dbl> <chr> <chr> <dbl>
## 1 2009 3585 1186373 Wonder Woman 3
## 2 2017 7241 0451279 Wonder Woman 3
## 3 2017 9294 <NA> Wonder Woman 3
## 4 2017 9293 <NA> Wonder Woman 3
# Hapus duplikat Wonder Woman
movies <- movies %>% filter(!(id %in% c(9293, 9294)))
# Cek film lain dengan data duplikat
bechdel %>% count(title, year) %>% filter(n > 1)
## # A tibble: 8 × 3
## title year n
## <chr> <dbl> <int>
## 1 Ayneh 1997 2
## 2 Dracula 1931 2
## 3 Into the Woods 2014 2
## 4 Last Call at the Oasis 2011 2
## 5 Sleeping Beauty 1959 2
## 6 Terms of Endearment 1983 2
## 7 The Danish Girl 2015 2
## 8 Wonder Woman 2017 3
movies <- movies %>%
mutate(rating_cat = as.factor(rating)) %>%
drop_na(man_older, rating)
ggplot(movies, aes(x = rating_cat, y = man_older)) +
geom_boxplot(outlier.shape = NA) +
geom_jitter(alpha = .2) +
scale_x_discrete(labels = c("Fail", "No conversation", "Only about men", "Pass")) +
labs(x = "Bechdel test result", y = "Male age gap", title = "Something to talk about")
cor(movies$man_older, movies$rating)
## [1] -0.1453132
cor(movies$man_older, movies$rating, method = "spearman")
## [1] -0.1508434
model <- lm(man_older ~ rating_cat, data = movies)
summary(model)
##
## Call:
## lm(formula = man_older ~ rating_cat, data = movies)
##
## Residuals:
## Min 1Q Median 3Q Max
## -59.018 -6.216 -0.809 5.982 40.183
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.241 1.869 4.946 9.33e-07 ***
## rating_cat1 1.567 2.011 0.779 0.436
## rating_cat2 -0.424 2.091 -0.203 0.839
## rating_cat3 -2.223 1.929 -1.153 0.249
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.06 on 768 degrees of freedom
## Multiple R-squared: 0.02433, Adjusted R-squared: 0.02051
## F-statistic: 6.383 on 3 and 768 DF, p-value: 0.0002833