library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(pwr)
## Warning: package 'pwr' was built under R version 4.3.3
library(ggthemes)
library(ggrepel)
library(effsize)
library(pwrss)
## 
## Attaching package: 'pwrss'
## 
## The following object is masked from 'package:stats':
## 
##     power.t.test
data <- read.csv ("C:\\Users\\varsh\\OneDrive\\Desktop\\Gitstuff\\age_gaps.CSV")

Hypothesis 1:

Null Hypothesis (H0): There is no significant difference in the mean age difference between couples in movies directed by Woody Allen and movies directed by other directors.

Alternative Hypothesis (H1): The mean age difference between couples in movies directed by Woody Allen is significantly different from the mean age difference in movies directed by other directors.

Statistical assumptions:

Neyman-Pearson Testing

woody_allen_movies <- subset(data, director == "Woody Allen")

woody_allen_mean_age_difference <- mean(woody_allen_movies$age_difference)

other_directors_movies <- subset(data, director != "Woody Allen")

other_directors_mean_age_difference <- mean(other_directors_movies$age_difference)
observed_diff <- abs(woody_allen_mean_age_difference- other_directors_mean_age_difference)
bootstrap <- function (x, func=mean, n_iter=10^4) {
  # empty vector to be filled with values from each iteration
  func_values <- c(NULL)
  
  # we simulate sampling `n_iter` times
  for (i in 1:n_iter) {
    # pull the sample (e.g., a vector or data frame)
    x_sample <- sample_n(x, size = length(x), replace = TRUE)
    
    # add on this iteration's value to the collection
    func_values <- c(func_values, func(x_sample))
  }
  
  return(func_values)
}
diff_in_avg <- function (x_data) {
  woody_allen_movies <- subset(data, director == "Woody Allen")

  woody_allen_mean_age_difference <- mean(woody_allen_movies$age_difference)
  
  other_directors_movies <- subset(data, director != "Woody Allen")
  
  other_directors_mean_age_difference <- mean(other_directors_movies$age_difference)
  
  diff <- abs(woody_allen_mean_age_difference - 
           other_directors_mean_age_difference)
  
  return(diff)
}

diffs_in_avgs <- bootstrap(data, diff_in_avg, n_iter = 100)
ggplot() +
  geom_function(xlim = c(-20, 20), 
                fun = function(x) dnorm(x, mean = 0, 
                                        sd = sd(diffs_in_avgs))) +
  geom_vline(mapping = aes(xintercept = observed_diff,
                           color = paste("observed: ",
                                         round(observed_diff)))) +
  labs(title = "Bootstrapped Sampling Distribution of mean Light Difference",
       x = "Difference in Mean of average_age of actors",
       y = "Probability Density",
       color = "") +
  scale_x_continuous(breaks = seq(-20, 20, 5)) +
  theme_minimal()

cohen.d(d = woody_allen_movies$age_difference,
        f = other_directors_movies$age_difference)
## 
## Cohen's d
## 
## d estimate: 1.175968 (large)
## 95 percent confidence interval:
##     lower     upper 
## 0.7308021 1.6211347

Cohen’s d value is 1.17 which is very high, hence we can reject the null hypothesis

f_sampling <- function(x) dnorm(x, mean = 0, 
                                sd = sd(diffs_in_avgs))

ggplot() +
  stat_function(mapping = aes(fill = 'more extreme samples'),
                fun = f_sampling, 
                xlim = c(observed_diff, 20),
                geom = "area") +
  stat_function(mapping = aes(fill = 'more extreme samples'),
                fun = f_sampling, 
                xlim = c(-20, -observed_diff),
                geom = "area") +
  geom_function(xlim = c(-20, 20), 
                fun = f_sampling) +
  geom_vline(mapping = aes(xintercept = observed_diff,
                           color = paste("observed: ",
                                         round(observed_diff, 1)))) +
  labs(title = "Bootstrapped Sampling Distribution of Light Differences",
       x = "Difference in Ages",
       y = "Probability Density",
       color = "",
       fill = "") +
  scale_x_continuous(breaks = seq(-20, 20, 5)) +
  scale_fill_manual(values = 'lightblue') +
  theme_minimal()

diffs_in_avgs_d <- diffs_in_avgs - mean(diffs_in_avgs)

# proportion of times the difference is more extreme
paste("p-value ", 
      sum(abs(observed_diff) < abs(diffs_in_avgs_d)) /
        length(diffs_in_avgs_d))
## [1] "p-value  0"

Conclusions:

With P value being 0 and effect size being 1.17 we can reject the null hypothesis.

Hypothesis 2:

Null Hypothesis (H0): There is no significant difference in the mean age difference between couples in movies released before and after the year 2000.

Alternative Hypothesis (H1): The mean age difference between couples in movies released before and after the year 2000 is significantly different.

Statistical assumptions:

  • Alpha Level or Significance Value:
    • Value Chosen: 0.05
    • Reason: Alpha levels are chosen based on the context and based on how low the level of Type-1 error should be. Since, this isn’t a very important or a critical issue I am going with the standard value of 0.05 which is a standard choice that aligns with common statistical practices.
  • Power Level:
    • Value Chosen: 0.08
    • Reason: This is the probability of correctly rejecting the Null Hypothesis. I chose 0.08 because it is a standard choice that aligns with common statistical practices.
  • Minimum Effect Size:
    • Value Chosen: 0.20
    • Reason: This is the the minimum value for effect size which should be considered while comparing two samples of the same population, smaller the effect size smaller the difference between the two sampling distributions. I am going with 0.20 as my minimum effect size as it is a common statistical practices.

Neyman-Pearson Testing

data$release_year <- as.numeric(data$release_year)

before_2000 <- subset(data, release_year < 2000)
after_2000 <- subset(data, release_year >= 2000)

avg_age_diff_before_2000 <- mean(before_2000$age_difference, na.rm = TRUE)
avg_age_diff_after_2000 <- mean(after_2000$age_difference, na.rm = TRUE)
observed_diff <- abs(avg_age_diff_before_2000 - avg_age_diff_after_2000)
diff_in_avg <- function (x_data) {
  before_2000 <- subset(data, release_year < 2000)
after_2000 <- subset(data, release_year >= 2000)

avg_age_diff_before_2000 <- mean(before_2000$age_difference, na.rm = TRUE)
avg_age_diff_after_2000 <- mean(after_2000$age_difference, na.rm = TRUE)
  diff <- abs(avg_age_diff_before_2000 - 
           avg_age_diff_after_2000)
  
  return(diff)
}

diffs_in_avgs <- bootstrap(data, diff_in_avg, n_iter = 100)
ggplot() +
  geom_function(xlim = c(-5, 5), 
                fun = function(x) dnorm(x, mean = 0, 
                                        sd = sd(diffs_in_avgs))) +
  geom_vline(mapping = aes(xintercept = observed_diff,
                           color = paste("observed: ",
                                         round(observed_diff)))) +
  labs(title = "Bootstrapped Sampling Distribution of mean Light Difference",
       x = "Observed Mean difference in age",
       y = "Probability Density",
       color = "") +
  scale_x_continuous(breaks = seq(-5, 5, 1)) +
  theme_minimal()

cohen.d(d = after_2000$age_difference,
        f = before_2000$age_difference)
## 
## Cohen's d
## 
## d estimate: -0.346253 (small)
## 95 percent confidence interval:
##      lower      upper 
## -0.4691451 -0.2233609

The cohen’s d value is very small/insignificant, hence with such small value we fail reject the null hypothesis. We can continue our hypothesis testing further.

We are assuming that the acceptable age difference is 7 years

test <- pwrss.t.2means(mu1 = 7, 
                       sd1 = sd(pluck(data, "age_difference")),
                       kappa = 1,
                       power = .80, alpha = 0.05, 
                       alternative = "not equal")
##  Difference between Two means 
##  (Independent Samples t Test) 
##  H0: mu1 = mu2 
##  HA: mu1 != mu2 
##  ------------------------------ 
##   Statistical power = 0.8 
##   n1 = 25 
##   n2 = 25 
##  ------------------------------ 
##  Alternative = "not equal" 
##  Degrees of freedom = 48 
##  Non-centrality parameter = 2.908 
##  Type I error rate = 0.05 
##  Type II error rate = 0.2
plot(test)

Fisher’s Significance Testing

P Value

f_sampling <- function(x) dnorm(x, mean = 0, 
                                sd = sd(diffs_in_avgs))

ggplot() +
  stat_function(mapping = aes(fill = 'more extreme samples'),
                fun = f_sampling, 
                xlim = c(observed_diff, 40),
                geom = "area") +
  stat_function(mapping = aes(fill = 'more extreme samples'),
                fun = f_sampling, 
                xlim = c(-40, -observed_diff),
                geom = "area") +
  geom_function(xlim = c(-40, 40), 
                fun = f_sampling) +
  geom_vline(mapping = aes(xintercept = observed_diff,
                           color = paste("observed: ",
                                         round(observed_diff, 1)))) +
  labs(title = "Bootstrapped Sampling Distribution of Light Differences",
       x = "Difference in Age Difference Calculated",
       y = "Probability Density",
       color = "",
       fill = "") +
  scale_x_continuous(breaks = seq(-40, 40, 10)) +
  scale_fill_manual(values = 'lightblue') +
  theme_minimal()

lm_model <- lm(age_difference ~ release_year, data = data)

se_coef <- summary(lm_model)$coef["release_year", "Std. Error"]

t_statistic <- coef(lm_model)["release_year"] / se_coef

df <- nrow(data) - length(coef(lm_model))

p_value <- 2 * pt(abs(t_statistic), df = df, lower.tail = FALSE)

cat("t-statistic:", t_statistic, "\n")
## t-statistic: -7.086887
cat("p-value:", p_value, "\n")
## p-value: 2.383761e-12

With this we cannot reject as well as accept the null hypothesis.