library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(pwr)
## Warning: package 'pwr' was built under R version 4.3.3
library(ggthemes)
library(ggrepel)
library(effsize)
library(pwrss)
##
## Attaching package: 'pwrss'
##
## The following object is masked from 'package:stats':
##
## power.t.test
data <- read.csv ("C:\\Users\\varsh\\OneDrive\\Desktop\\Gitstuff\\age_gaps.CSV")
Null Hypothesis (H0): There is no significant difference in the mean age difference between couples in movies directed by Woody Allen and movies directed by other directors.
Alternative Hypothesis (H1): The mean age difference between couples in movies directed by Woody Allen is significantly different from the mean age difference in movies directed by other directors.
woody_allen_movies <- subset(data, director == "Woody Allen")
woody_allen_mean_age_difference <- mean(woody_allen_movies$age_difference)
other_directors_movies <- subset(data, director != "Woody Allen")
other_directors_mean_age_difference <- mean(other_directors_movies$age_difference)
observed_diff <- abs(woody_allen_mean_age_difference- other_directors_mean_age_difference)
bootstrap <- function (x, func=mean, n_iter=10^4) {
# empty vector to be filled with values from each iteration
func_values <- c(NULL)
# we simulate sampling `n_iter` times
for (i in 1:n_iter) {
# pull the sample (e.g., a vector or data frame)
x_sample <- sample_n(x, size = length(x), replace = TRUE)
# add on this iteration's value to the collection
func_values <- c(func_values, func(x_sample))
}
return(func_values)
}
diff_in_avg <- function (x_data) {
woody_allen_movies <- subset(data, director == "Woody Allen")
woody_allen_mean_age_difference <- mean(woody_allen_movies$age_difference)
other_directors_movies <- subset(data, director != "Woody Allen")
other_directors_mean_age_difference <- mean(other_directors_movies$age_difference)
diff <- abs(woody_allen_mean_age_difference -
other_directors_mean_age_difference)
return(diff)
}
diffs_in_avgs <- bootstrap(data, diff_in_avg, n_iter = 100)
ggplot() +
geom_function(xlim = c(-20, 20),
fun = function(x) dnorm(x, mean = 0,
sd = sd(diffs_in_avgs))) +
geom_vline(mapping = aes(xintercept = observed_diff,
color = paste("observed: ",
round(observed_diff)))) +
labs(title = "Bootstrapped Sampling Distribution of mean Light Difference",
x = "Difference in Mean of average_age of actors",
y = "Probability Density",
color = "") +
scale_x_continuous(breaks = seq(-20, 20, 5)) +
theme_minimal()
cohen.d(d = woody_allen_movies$age_difference,
f = other_directors_movies$age_difference)
##
## Cohen's d
##
## d estimate: 1.175968 (large)
## 95 percent confidence interval:
## lower upper
## 0.7308021 1.6211347
Cohen’s d value is 1.17 which is very high, hence we can reject the null hypothesis
f_sampling <- function(x) dnorm(x, mean = 0,
sd = sd(diffs_in_avgs))
ggplot() +
stat_function(mapping = aes(fill = 'more extreme samples'),
fun = f_sampling,
xlim = c(observed_diff, 20),
geom = "area") +
stat_function(mapping = aes(fill = 'more extreme samples'),
fun = f_sampling,
xlim = c(-20, -observed_diff),
geom = "area") +
geom_function(xlim = c(-20, 20),
fun = f_sampling) +
geom_vline(mapping = aes(xintercept = observed_diff,
color = paste("observed: ",
round(observed_diff, 1)))) +
labs(title = "Bootstrapped Sampling Distribution of Light Differences",
x = "Difference in Ages",
y = "Probability Density",
color = "",
fill = "") +
scale_x_continuous(breaks = seq(-20, 20, 5)) +
scale_fill_manual(values = 'lightblue') +
theme_minimal()
diffs_in_avgs_d <- diffs_in_avgs - mean(diffs_in_avgs)
# proportion of times the difference is more extreme
paste("p-value ",
sum(abs(observed_diff) < abs(diffs_in_avgs_d)) /
length(diffs_in_avgs_d))
## [1] "p-value 0"
With P value being 0 and effect size being 1.17 we can reject the null hypothesis.
Null Hypothesis (H0): There is no significant difference in the mean age difference between couples in movies released before and after the year 2000.
Alternative Hypothesis (H1): The mean age difference between couples in movies released before and after the year 2000 is significantly different.
data$release_year <- as.numeric(data$release_year)
before_2000 <- subset(data, release_year < 2000)
after_2000 <- subset(data, release_year >= 2000)
avg_age_diff_before_2000 <- mean(before_2000$age_difference, na.rm = TRUE)
avg_age_diff_after_2000 <- mean(after_2000$age_difference, na.rm = TRUE)
observed_diff <- abs(avg_age_diff_before_2000 - avg_age_diff_after_2000)
diff_in_avg <- function (x_data) {
before_2000 <- subset(data, release_year < 2000)
after_2000 <- subset(data, release_year >= 2000)
avg_age_diff_before_2000 <- mean(before_2000$age_difference, na.rm = TRUE)
avg_age_diff_after_2000 <- mean(after_2000$age_difference, na.rm = TRUE)
diff <- abs(avg_age_diff_before_2000 -
avg_age_diff_after_2000)
return(diff)
}
diffs_in_avgs <- bootstrap(data, diff_in_avg, n_iter = 100)
ggplot() +
geom_function(xlim = c(-5, 5),
fun = function(x) dnorm(x, mean = 0,
sd = sd(diffs_in_avgs))) +
geom_vline(mapping = aes(xintercept = observed_diff,
color = paste("observed: ",
round(observed_diff)))) +
labs(title = "Bootstrapped Sampling Distribution of mean Light Difference",
x = "Observed Mean difference in age",
y = "Probability Density",
color = "") +
scale_x_continuous(breaks = seq(-5, 5, 1)) +
theme_minimal()
cohen.d(d = after_2000$age_difference,
f = before_2000$age_difference)
##
## Cohen's d
##
## d estimate: -0.346253 (small)
## 95 percent confidence interval:
## lower upper
## -0.4691451 -0.2233609
The cohen’s d value is very small/insignificant, hence with such small value we fail reject the null hypothesis. We can continue our hypothesis testing further.
We are assuming that the acceptable age difference is 7 years
test <- pwrss.t.2means(mu1 = 7,
sd1 = sd(pluck(data, "age_difference")),
kappa = 1,
power = .80, alpha = 0.05,
alternative = "not equal")
## Difference between Two means
## (Independent Samples t Test)
## H0: mu1 = mu2
## HA: mu1 != mu2
## ------------------------------
## Statistical power = 0.8
## n1 = 25
## n2 = 25
## ------------------------------
## Alternative = "not equal"
## Degrees of freedom = 48
## Non-centrality parameter = 2.908
## Type I error rate = 0.05
## Type II error rate = 0.2
plot(test)
f_sampling <- function(x) dnorm(x, mean = 0,
sd = sd(diffs_in_avgs))
ggplot() +
stat_function(mapping = aes(fill = 'more extreme samples'),
fun = f_sampling,
xlim = c(observed_diff, 40),
geom = "area") +
stat_function(mapping = aes(fill = 'more extreme samples'),
fun = f_sampling,
xlim = c(-40, -observed_diff),
geom = "area") +
geom_function(xlim = c(-40, 40),
fun = f_sampling) +
geom_vline(mapping = aes(xintercept = observed_diff,
color = paste("observed: ",
round(observed_diff, 1)))) +
labs(title = "Bootstrapped Sampling Distribution of Light Differences",
x = "Difference in Age Difference Calculated",
y = "Probability Density",
color = "",
fill = "") +
scale_x_continuous(breaks = seq(-40, 40, 10)) +
scale_fill_manual(values = 'lightblue') +
theme_minimal()
lm_model <- lm(age_difference ~ release_year, data = data)
se_coef <- summary(lm_model)$coef["release_year", "Std. Error"]
t_statistic <- coef(lm_model)["release_year"] / se_coef
df <- nrow(data) - length(coef(lm_model))
p_value <- 2 * pt(abs(t_statistic), df = df, lower.tail = FALSE)
cat("t-statistic:", t_statistic, "\n")
## t-statistic: -7.086887
cat("p-value:", p_value, "\n")
## p-value: 2.383761e-12
With this we cannot reject as well as accept the null hypothesis.