library(infer) #install_github("tidymodels/infer")
library(tidyverse)

Inference for one mean

The most exciting inference of them all!

First, calculate the observed mean and save it as obs_stat. This value is used in every setting below.

# Point estimate: the sample mean of `hours` in `gss`, stored as `obs_stat`
# so the later p-value and confidence-interval calls can reuse it.
obs_stat <- specify(gss, response = hours) %>%
  calculate(stat = "mean")

Simulation-based inference

Provided as reference.

# fix the RNG seed so the bootstrap resamples generated below are reproducible
set.seed(1234)

Hypothesis testing

# Null distribution for the point null mu = 40: 1000 bootstrap resamples,
# each reduced to its mean.
null_dist <- specify(gss, response = hours) %>%
  hypothesize(null = "point", mu = 40) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "mean")

# Two-sided p-value of the observed mean against that null distribution.
null_dist %>%
  get_p_value(obs_stat = obs_stat, direction = "both")
## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1   0.056

# Plot the null distribution with the two-sided p-value region shaded.
null_dist %>%
  visualize() +
  shade_p_value(obs_stat = obs_stat, direction = "both")

Confidence intervals

# Bootstrap distribution of the sample mean: 1000 resamples, no hypothesis
# imposed.
boot_dist <- specify(gss, response = hours) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "mean")

# 95% interval using the "se" method, with obs_stat as the point estimate.
ci <- boot_dist %>%
  get_confidence_interval(
    point_estimate = obs_stat,
    level = 0.95,
    type = "se"
  )

# Plot the bootstrap distribution with the interval shaded.
boot_dist %>%
  visualize() +
  shade_confidence_interval(endpoints = ci)

CLT-based inference

Hypothesis testing

Option 1 - A new word instead of generate()

Previous suggestions include assume(), theorize(), etc.

# generate a null distribution of means
# NOTE: ASSUME() is a placeholder for the proposed verb (it is not defined in
# the code shown here). It takes the place of the generate() + calculate()
# steps used in the simulation-based pipeline.
# df = 499 is presumably n - 1 -- TODO confirm against the size of gss.
null_dist <- gss %>%
  specify(response = hours) %>%
  hypothesize(null = "point", mu = 40) %>%
  ASSUME(distribution = "t", df = 499)

# get p-value
# same call as in the simulation-based section; only null_dist differs
get_p_value(
  null_dist,
  obs_stat = obs_stat,
  direction = "both"
)

# visualize null_dist with p-value
visualize(null_dist) +
  shade_p_value(obs_stat = obs_stat, direction = "both")

Option 2 - Define the distribution directly in get_p_value()

# do nothing instead of generate: in this option the distribution is named
# directly in each downstream call, so no null_dist object is created

# get p-value
# the distribution ("t", df = 499) is passed where the other options pass a
# distribution object
get_p_value(
  "t", df = 499,
  obs_stat = obs_stat,
  direction = "both"
)

# visualize the t distribution with p-value
# note: the distribution is passed straight to visualize(), mirroring the
# get_p_value() call above; visualize(null_dist) would silently reuse the
# object left over from Option 1, which this option never creates
visualize("t", df = 499) +
  shade_p_value(obs_stat = obs_stat, direction = "both")

Option 3 - Define the distribution separately with a new verb

# ASSUME a sampling distribution
# NOTE: ASSUME() is a placeholder for the proposed verb (it is not defined in
# the code shown here). Here it is called on its own: no data, specify() or
# hypothesize() step feeds into it.
# df = 499 is presumably n - 1 -- TODO confirm against the size of gss.
null_dist <- ASSUME("t", df = 499)

# get p-value
# same call as in the simulation-based section; only null_dist differs
get_p_value(
  null_dist,
  obs_stat = obs_stat,
  direction = "both"
)

# visualize null_dist with p-value
visualize(null_dist) +
  shade_p_value(obs_stat = obs_stat, direction = "both")

Confidence intervals

Option 1 - A new word instead of generate()

Previous suggestions include assume(), theorize(), etc.

# generate a sampling distribution of means
# NOTE: ASSUME() is a placeholder for the proposed verb (it is not defined in
# the code shown here). Unlike the hypothesis-testing pipeline, there is no
# hypothesize() step before it.
# df = 499 is presumably n - 1 -- TODO confirm against the size of gss.
sampling_dist <- gss %>%
  specify(response = hours) %>%
  ASSUME(distribution = "t", df = 499)

# get ci
# note: no type argument
ci <- get_confidence_interval(sampling_dist,
                              level = 0.95,
                              point_estimate = obs_stat)

# visualize sampling_dist with ci
# note: I've never seen a CLT-based ci visualized this way
visualize(sampling_dist) +
  shade_confidence_interval(endpoints = ci)

Option 2 - Define the distribution directly in get_confidence_interval()

# do nothing instead of generate: in this option the distribution is named
# directly in each downstream call, so no sampling_dist object is created

# get ci
# the distribution ("t", df = 499) is passed where the other options pass a
# distribution object; no type argument is given
ci <- get_confidence_interval("t", df = 499,
                              level = 0.95,
                              point_estimate = obs_stat)
# visualize sampling_dist with ci
# note: While I've never seen a CLT-based ci visualized this way anyway,
# I don't know how visualize could work because the following wouldn't work
# since the t-distribution is centered at 0
# (ci is built around obs_stat, a mean of hours, so its endpoints are not on
# the same scale as a t-distribution centered at 0)
visualize("t", df = 499) +
  shade_confidence_interval(endpoints = ci)

Option 3 - Define the distribution separately with a new verb

# ASSUME a sampling distribution
# NOTE: ASSUME() is a placeholder for the proposed verb (it is not defined in
# the code shown here). Here it is called on its own: no data or specify()
# step feeds into it.
# df = 499 is presumably n - 1 -- TODO confirm against the size of gss.
sampling_dist <- ASSUME("t", df = 499)

# get ci
# no type argument is given
ci <- get_confidence_interval(sampling_dist,
                              level = 0.95,
                              point_estimate = obs_stat)

# visualize sampling_dist with ci
# note: While I've never seen a CLT-based ci visualized this way anyway,
# I don't know how visualize could work because the following wouldn't work
# since the t-distribution is centered at 0
# (ci is built around obs_stat, a mean of hours, so its endpoints are not on
# the same scale as a t-distribution centered at 0)
visualize(sampling_dist) +
  shade_confidence_interval(endpoints = ci)