Inference Simulations Exercise

Setup

set.seed(123)

# Import libraries
library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.2     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggplot2)
library(ggthemes)

Define the Simulation Function

make_tea_data <- function(n_total = 48, sigma = 1.25, delta = 3) {
  n_half <- n_total / 2
  tibble(condition = c(rep("milk first", n_half), rep("tea first", n_half)),
         rating = c(round(rnorm(n_half, mean = 3.5 + delta, sd = sigma)), 
                    round(rnorm(n_half, mean = 3.5, sd = sigma)))) |>
    mutate(rating = if_else(rating > 10, 10, rating),
           rating = if_else(rating < 1, 1, rating))
}

Make Data Frames

tea_data_highn <- make_tea_data(n_total = 48, delta=3)

Calculate Mean of Each Condition

# Summarize the mean rating 
tea_data_highn %>%
  group_by(condition) %>%
  summarize(mean_rating = mean(rating))

# A tibble: 2 × 2
  condition  mean_rating
  <chr>            <dbl>
1 milk first        6.54
2 tea first         3.58

Perform T-test

out_high_n <- tea_data_highn %>%
  t.test(rating ~ condition, data = ., paired = FALSE, var.equal = TRUE)

Simulate 1000 Experiments

# Simulate 1000 experiments
samps_highn <- tibble(sim = 1:1000) |>
  mutate(data = map(sim, \(i) make_tea_data(n_total = 48, delta=3))) |>
  unnest(cols = data)

Summarize Simulations

tea_data_highn_summary <- samps_highn |>
  group_by(sim, condition) |> 
  summarise(mean_rating = mean(rating)) |> 
  group_by(sim) |> 
  summarise(delta = mean_rating[condition == "milk first"] -
              mean_rating[condition == "tea first"])

`summarise()` has grouped output by 'sim'. You can override using the `.groups`
argument.

Plot Difference for Higher-n

ggplot(data=tea_data_highn_summary, aes(x=delta)) +
  geom_histogram(alpha=.8, bins=20) +
  theme_few()

Null Model Simulation

null_model <- tibble(sim = 1:1000) |>
  mutate(data = map(sim, \(i) make_tea_data(n_total = 48, delta = 0))) |>
  unnest(cols = data)

null_model_summary <- null_model |>
  group_by(sim, condition) |>
  summarise(mean_rating = mean(rating)) |>
  group_by(sim) |>
  summarise(delta = mean_rating[condition == "milk first"] -
              mean_rating[condition == "tea first"])

`summarise()` has grouped output by 'sim'. You can override using the `.groups`
argument.

ggplot(data=null_model_summary, aes(x=delta)) +
  geom_histogram(alpha=.8, bins=20) +
  theme_few()

Permutations

tea_data_highn_shuffled <- samps_highn %>%
  group_by(sim) %>%
  mutate(condition_shuffled = sample(condition))

tea_data_highn_shuffled_summary <- tea_data_highn_shuffled %>%
  group_by(condition_shuffled, sim) %>% 
  summarize(mean = mean(rating),
            sd = sd(rating)) %>%
  ungroup() %>% 
  summarize(delta = diff(mean))

`summarise()` has grouped output by 'condition_shuffled'. You can override
using the `.groups` argument.

Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
dplyr 1.1.0.
ℹ Please use `reframe()` instead.
ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
  always returns an ungrouped data frame and adjust accordingly.

hist(tea_data_highn_shuffled_summary$delta)

Visualizing Shuffled Data

ggplot(data = tea_data_highn_shuffled %>% filter(sim==3),
       mapping = aes(x = condition_shuffled, y = rating))+
  geom_point(mapping = aes(color = condition),
             alpha=.8,
             position = position_jitter(height = .1,
                                        width = 0.1)) +
  stat_summary(fun.data = mean_cl_boot,
               geom = "linerange",
               size = 1) +
  stat_summary(fun = "mean",
               geom = "point",
               shape = 21,
               color = "black",
               fill = "white") + 
  scale_y_continuous(breaks = 0:10,
                     labels = 0:10,
                     limits = c(0, 10))

Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

Calculate Actual Difference

difference_actual = tea_data_highn %>%
  group_by(condition) %>% 
  summarize(mean = mean(rating)) %>% 
  pull(mean) %>% 
  diff()

Plot Distribution of Differences

ggplot(data = tea_data_highn_shuffled_summary, aes(x=delta)) +
  geom_histogram(aes(y = stat(density)),
                 color = "black",
                 fill = "lightblue",
                 binwidth = 0.05) + 
  stat_density(geom = "line",
               size = 1.5,
               bw = 0.2) +
  geom_vline(xintercept = difference_actual, color = "red", size = 2) +
  labs(x = "difference between means")

Warning: `stat(density)` was deprecated in ggplot2 3.4.0.
ℹ Please use `after_stat(density)` instead.

Calculate p-value

tea_data_highn_shuffled_summary %>% 
  summarize(p_value = sum(delta <= difference_actual)/n())

# A tibble: 1 × 1
  p_value
    <dbl>
1       0

Plot Null vs Empirical Simulations

ggplot(data=tea_data_highn_shuffled_summary, aes(x=delta)) +
  geom_histogram(alpha=.4, bins=20, color='blue', fill='blue') +
  geom_histogram(alpha=.8, bins=20, data=tea_data_highn_summary) +
  theme_few()

Confidence Intervals

tea_dataset = tea_data_highn

tea_ratings <- filter(tea_dataset, condition == "tea first")$rating
milk_ratings <- filter(tea_dataset, condition == "milk first")$rating

# Calculate CI on the effect
n_tea <- length(tea_ratings)
n_milk <- length(milk_ratings)
sd_tea <- sd(tea_ratings)
sd_milk <- sd(milk_ratings)

tea_sd_pooled <- sqrt(((n_tea - 1) * sd_tea ^ 2 + (n_milk - 1) * sd_milk ^ 2) / 
                        (n_tea + n_milk - 2))

tea_se <- tea_sd_pooled * sqrt((1 / n_tea) + (1 / n_milk))

delta_hat <- mean(milk_ratings) - mean(tea_ratings)
tea_ci_lower <- delta_hat - tea_se * qnorm(0.975)
tea_ci_upper <- delta_hat + tea_se * qnorm(0.975)

# Calculate 95% CI with t-distribution
num_observations = length(tea_dataset$rating)
df = num_observations-2
tea_ci_lower_ttest <- delta_hat - tea_se * qt(0.975,df)
tea_ci_upper_ttest <- delta_hat + tea_se * qt(0.975,df)

t.test(tea_ratings, milk_ratings, var.equal=TRUE)


    Two Sample t-test

data:  tea_ratings and milk_ratings
t = -8.3198, df = 46, p-value = 1.003e-10
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -3.674070 -2.242596
sample estimates:
mean of x mean of y 
 3.583333  6.541667

Plot Confidence Intervals for Each Condition

confidence_level=.95

tea_data_highn_summary_cis <- tea_data_highn %>%
  group_by(condition) %>%
  summarize(cond_mean = mean(rating), cond_sd = sd(rating), n=length(rating)) %>%
  mutate(error = qt(1 - (1 - confidence_level)/2, df = n - 1)* (cond_sd/sqrt(n))) %>%
  mutate(ci_upper = cond_mean + error, ci_lower = cond_mean - error) %>%
  mutate(se_upper = cond_mean + cond_sd/sqrt(n), se_lower = cond_mean - cond_sd/sqrt(n))

ggplot(data = tea_data_highn, aes(x=condition, y=rating, col=condition))  +
  geom_jitter(width=.1, height=0, alpha=.3) +
  theme_few() +
  geom_pointrange(data = tea_data_highn_summary_cis, aes(x=condition, y = cond_mean, ymin = ci_lower, ymax = ci_upper))

Observations on the Differences Between d = 1.5 and d = 3

When comparing the results of simulations with different effect sizes (delta = 1.5 and delta = 3), several differences emerge:

Mean Differences:
- When delta = 1.5, the mean differences between the conditions are smaller compared to the larger difference observed when delta = 3. This is expected since delta controls the true difference between the two groups.
Distribution of Simulated Differences:
- With delta = 1.5, the histogram of simulated differences shows more overlap between the two conditions, and the distribution is more centered around zero. There is more variability in the differences across simulations, indicating that smaller effect sizes are more prone to sampling error.
- When delta = 3, the differences between conditions are more pronounced, and the distribution shifts away from zero. The larger effect size leads to more consistent and distinct differences between the groups across simulations.