#loading libraries and data into the file
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)

# Load the movie data set from its local CSV file into a tibble.
# NOTE(review): the path is absolute and machine-specific -- confirm it before
# running this notebook on another computer.
bechdel_data_movies <- read_csv(
  file = "C:/Users/Lauren/Documents/Stats Data/movies.csv"
)
## Rows: 1794 Columns: 34
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (24): imdb, title, test, clean_test, binary, domgross, intgross, code, d...
## dbl  (7): year, budget, budget_2013, period_code, decade_code, metascore, im...
## num  (1): imdb_votes
## lgl  (2): response, error
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Your RMarkdown notebook for this data dive should contain the following:

After having explored your data set over the past few weeks, you should already have some questions. For this week, we will do AB Testing within the context of your data set. So, you’ll be using hypothesis testing to calculate some difference between two groups.

For each of the following, you’ll need at least one main variable, and some way to define “Group A” and “Group B” (e.g., if you have a categorical column, you could consider “category 1” and “[the rest]”). Your main variable should either be continuous or binary (i.e., “success” or “failure”).

Devise two different null hypotheses based on two different columns of data (i.e., each hypothesis gets a different column of data).

Hypothesis 1:

The null hypothesis: There is no difference in profitability (gross income (domgross_2013 + intgross_2013) minus budget (budget_2013)) between films that pass the Bechdel test and films that fail.

The alternative hypothesis: Movies that pass the Bechdel test are more profitable than movies that do not.

Alpha: .05 is a common starting point and there is no reason to deviate because the cost of potential false positive isn’t very high. This data set is about entertainment and investing, not something significant like medicine or public health.

Beta: 0.1 (i.e., power = 0.9). There is some importance to this test even though it deals with the realm of entertainment. A false negative, where we fail to detect that movies passing the Bechdel test are more profitable when they really are, could lead to fewer roles for women actors and fewer women characters in film. While not the end of the world, it could negatively impact the self-image and self-esteem of young girls and women who would otherwise look up to these actresses or characters. Thus, beta = 0.1 feels important enough to note without giving it undue weight.

Delta: a 10% difference relative to the median budget (median = $36,995,786, so a difference of about $3.7M).

Cohen’s d (delta divided by the pooled standard deviation of profitability): 0.009895347

This makes the required sample size n = 214,618 films in each group (passing and failing).

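This is not achievable with our current data set, which contains only 803 passing and 991 failing films. We must therefore conclude that we do not have enough data to perform a hypothesis test using the Neyman-Pearson framework with these settings. This matters because, with so few films, a test would have far too little power to detect a difference of this size, so a non-significant result would tell us very little.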

# Median of the budget_2013 column; the 10% effect size (delta) used in the
# power analysis is derived from this value.
median_budget <- bechdel_data_movies |>
  pull(budget_2013) |>
  median()
print(median_budget)
## [1] 36995786
# Coerce the *_2013 dollar columns to numeric so they can be used in
# arithmetic. Entries that cannot be parsed as numbers become NA, which is
# what the coercion warnings below report.
# as.numeric() has no `na.rm` argument (it was silently ignored before), so it
# is not passed here; missing values are handled where the columns are
# summarised.
bechdel_data_movies$domgross_2013 <- as.numeric(bechdel_data_movies$domgross_2013)
## Warning: NAs introduced by coercion
bechdel_data_movies$intgross_2013 <- as.numeric(bechdel_data_movies$intgross_2013)
## Warning: NAs introduced by coercion
bechdel_data_movies$budget_2013 <- as.numeric(bechdel_data_movies$budget_2013)


# Add a profitability column: the two *_2013 gross columns summed, minus the
# *_2013 budget. Any NA in the inputs makes profitability NA for that film.
# NOTE(review): confirm intgross_2013 does not already include the domestic
# gross, otherwise domestic revenue is counted twice.
bechdel_data_movies <- mutate(
  bechdel_data_movies,
  profitability = (domgross_2013 + intgross_2013) - budget_2013
)

# Split the films into the two A/B groups by Bechdel outcome.
pass_group <- filter(bechdel_data_movies, binary == "PASS")
fail_group <- filter(bechdel_data_movies, binary == "FAIL")


# Standard deviation of profitability within each group, skipping films whose
# profitability is NA.
s1 <- sd(pass_group$profitability, na.rm = TRUE)
s2 <- sd(fail_group$profitability, na.rm = TRUE)

# Total number of films (rows) in each group; printed later as the sample
# sizes that are actually available.
n1 <- nrow(pass_group)
n2 <- nrow(fail_group)

# Number of films that actually contribute to s1 and s2. The pooled standard
# deviation must weight each variance by the observations it was computed
# from, so films with NA profitability are excluded from these counts.
n1_obs <- sum(!is.na(pass_group$profitability))
n2_obs <- sum(!is.na(fail_group$profitability))

# Pooled standard deviation: variances weighted by their degrees of freedom.
s_pooled <- sqrt(
  ((n1_obs - 1) * s1^2 + (n2_obs - 1) * s2^2) / (n1_obs + n2_obs - 2)
)

# Smallest difference in profitability worth detecting: about 10% of the
# median budget, rounded to 3.7 million.
delta <- 3700000

# Cohen's d: the raw difference expressed in pooled-standard-deviation units.
cohens_d <- delta / s_pooled
print(cohens_d)
## [1] 0.009895347
# Required sample size per group for a two-sample t test with 90% power at
# alpha = 0.05. The effect size is passed as the standardized difference
# (delta = Cohen's d with sd = 1), taken from `cohens_d` rather than a
# hand-copied literal so it cannot drift from the calculation above.
# NOTE(review): the stated alternative hypothesis is directional ("more
# profitable"); confirm whether alternative = "one.sided" was intended.
power.t.test(
  n = NULL,
  delta = cohens_d,
  sd = 1,
  sig.level = 0.05,
  power = 0.9,
  type = "two.sample",
  alternative = "two.sided"
)
## 
##      Two-sample t test power calculation 
## 
##               n = 214618
##           delta = 0.009895347
##              sd = 1
##       sig.level = 0.05
##           power = 0.9
##     alternative = two.sided
## 
## NOTE: n is number in *each* group
# Number of films available in each group (n1 = PASS, n2 = FAIL), shown for
# comparison with the per-group n reported by the power calculation.
print(n1)
## [1] 803
print(n2)
## [1] 991

Visualization:

# Bar chart of the number of films in each Bechdel outcome group, with a
# dashed line at the per-group sample size the power analysis calls for.
# The manual fill scale is keyed by the values that actually occur in `binary`
# ("FAIL" / "PASS"); keying it by "0" / "1" matched no level, so the colours
# were never applied and ggplot2 warned about it.
hyp1_vis <- ggplot(bechdel_data_movies, aes(x = binary, fill = binary)) +
  geom_bar(width = 0.7, alpha = 0.85, color = "white") +
  geom_hline(
    yintercept = 214618,
    linetype = "dashed",
    color = "firebrick",
    linewidth = 1
  ) +
  scale_fill_manual(values = c("FAIL" = "#FF6B6B", "PASS" = "#4D96FF")) +
  labs(
    title = "Number of Films by Bechdel Test Outcome",
    subtitle = "Dashed line indicates reference value (214,618)",
    x = "Bechdel Test Result",
    y = "Count of Films",
    fill = "Bechdel Result"
  )
print(hyp1_vis)

Hypothesis 2:

Null Hypothesis: The mean IMDB rating is the same for movies that pass and fail the Bechdel test.

Alternate Hypothesis: The mean IMDB rating differs between the two groups.

# Two-sided Welch two-sample t-test (t.test's default, which does not assume
# equal variances) comparing mean imdb_rating between the two levels of
# `binary` (FAIL vs PASS).
t.test(imdb_rating ~ binary, data = bechdel_data_movies)
## 
##  Welch Two Sample t-test
## 
## data:  imdb_rating by binary
## t = 5.9089, df = 1496.3, p-value = 4.257e-09
## alternative hypothesis: true difference in means between group FAIL and group PASS is not equal to 0
## 95 percent confidence interval:
##  0.1900477 0.3789273
## sample estimates:
## mean in group FAIL mean in group PASS 
##           6.885202           6.600714

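The p-value here (4.257e-09) is much, much lower than .05. A p-value below a significance level of .05 is usually taken to mean that the result is statistically significant. There is nothing about the data set that requires a different threshold for significance, so we reject the null hypothesis: the difference in mean IMDb rating (about 6.89 for films that fail versus 6.60 for films that pass) is unlikely to be due to chance alone and is associated with whether the movie passes or fails the Bechdel test.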

# Box plots of IMDb rating for each Bechdel outcome group.
# The colours are keyed by group name so the mapping does not depend on level
# order, the x-axis label no longer carries a stray ")", and the legend gets a
# readable title instead of the default "factor(binary)".
hyp2_vis <- ggplot(
  bechdel_data_movies,
  aes(x = factor(binary), y = imdb_rating, fill = factor(binary))
) +
  geom_boxplot(alpha = 0.7) +
  scale_fill_manual(values = c("FAIL" = "tomato", "PASS" = "steelblue")) +
  labs(
    x = "Bechdel Test Result",
    y = "IMDb Rating",
    fill = "Bechdel Result",
    title = "IMDb Ratings by Bechdel Test Outcome"
  )

print(hyp2_vis)
## Warning: Removed 202 rows containing non-finite outside the scale range
## (`stat_boxplot()`).