Inference Lab

library(tidyverse)

Warning: package 'tidyverse' was built under R version 4.3.2

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.3     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.3     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(openintro)

Warning: package 'openintro' was built under R version 4.3.2

Loading required package: airports

Warning: package 'airports' was built under R version 4.3.2

Loading required package: cherryblossom

Warning: package 'cherryblossom' was built under R version 4.3.2

Loading required package: usdata

Warning: package 'usdata' was built under R version 4.3.2

library(infer)

Warning: package 'infer' was built under R version 4.3.3

library(skimr)

Warning: package 'skimr' was built under R version 4.3.3

data(yrbss)
yrbss %>% 
  skim()

Data summary
Name	Piped data
Number of rows	13583
Number of columns	13
_______________________
Column type frequency:
character	8
numeric	5
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
gender	12	1.00	4	6	2
grade	79	0.99	1	5	5
hispanic	231	0.98	3	8	2
race	2805	0.79	5	41	5
helmet_12m	311	0.98	5	12	6
text_while_driving_30d	918	0.93	1	13	8
hours_tv_per_school_day	338	0.98	1	12	7
school_night_hours_sleep	1248	0.91	1	3	7

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
age	77	0.99	16.16	1.26	12.00	15.00	16.00	17.00	18.00	▁▂▅▅▇
height	1004	0.93	1.69	0.10	1.27	1.60	1.68	1.78	2.11	▁▅▇▃▁
weight	1004	0.93	67.91	16.90	29.94	56.25	64.41	76.20	180.99	▆▇▂▁▁
physically_active_7d	273	0.98	3.90	2.56	0.00	2.00	4.00	7.00	7.00	▆▂▅▃▇
strength_training_7d	1176	0.91	2.95	2.58	0.00	0.00	3.00	5.00	7.00	▇▂▅▂▅

##Exercise 1

##There are 5 case studies

yrbss1 <- yrbss %>% 
  mutate(physical_3plus = if_else(physically_active_7d > 2, "yes", "no")) |>
filter(!is.na(weight)&!is.na(physical_3plus))

##Exercise 2

yrbss1 %>%
  group_by(physical_3plus) %>%
  summarise(mean_weight = mean(weight, na.rm = TRUE))

# A tibble: 2 × 2
  physical_3plus mean_weight
  <chr>                <dbl>
1 no                    66.7
2 yes                   68.4

Exercise 3

violin plot

ggplot(yrbss1, aes(physical_3plus, weight, fill = physical_3plus))+
  geom_violin()

obs_diff <- yrbss1 %>%
  specify(weight ~ physical_3plus) %>%
  calculate(stat = "diff in means", order = c("yes", "no"))
obs_diff

Response: weight (numeric)
Explanatory: physical_3plus (factor)
# A tibble: 1 × 1
   stat
  <dbl>
1  1.77

##Exercise 4

##The data is normally distributed so we can do an inference test

##Exercise 5

null_dist <- yrbss1  %>%
  specify(weight ~ physical_3plus) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "diff in means", order = c("yes", "no"))
null_dist

Response: weight (numeric)
Explanatory: physical_3plus (factor)
Null Hypothesis: independence
# A tibble: 1,000 × 2
   replicate     stat
       <int>    <dbl>
 1         1  0.166  
 2         2  0.309  
 3         3 -0.124  
 4         4  0.202  
 5         5 -0.00315
 6         6 -0.489  
 7         7  0.317  
 8         8 -0.348  
 9         9 -0.313  
10        10  0.0759 
# ℹ 990 more rows

##Exercise 6

ggplot(data = null_dist, aes(x = stat)) +
  geom_histogram() + 
  geom_vline(xintercept = pull(obs_diff), color = "purple")

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

exercise 7

null_dist %>%
  get_p_value(obs_stat = obs_diff, direction = "two_sided")

Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
based on the number of `reps` chosen in the `generate()` step.
ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.

# A tibble: 1 × 1
  p_value
    <dbl>
1       0

Exercise 8

The warning message is p value of 0 is an approximation of the population mean.

Exrercise 10

Calculate a 95% CI on height

boot_height <- yrbss1 |>
  filter(!is.na(height)) |>
  specify(response = height)|>
  generate(reps = 1000, type = "bootstrap") |>
  calculate(stat = "mean")
boot_height

Response: height (numeric)
# A tibble: 1,000 × 2
   replicate  stat
       <int> <dbl>
 1         1  1.69
 2         2  1.69
 3         3  1.69
 4         4  1.69
 5         5  1.69
 6         6  1.69
 7         7  1.69
 8         8  1.69
 9         9  1.69
10        10  1.69
# ℹ 990 more rows

boot_height |>
  get_confidence_interval(level = .95)

# A tibble: 1 × 2
  lower_ci upper_ci
     <dbl>    <dbl>
1     1.69     1.69

Exercise 11

boot_height |>
  get_confidence_interval(level = .90)

# A tibble: 1 × 2
  lower_ci upper_ci
     <dbl>    <dbl>
1     1.69     1.69

Exercise 12

HT for diff in mean height for yes and no for physical 3plus

null_height_dist  <- yrbss1 |>
  specify(height ~ physical_3plus) |>
  hypothesise(null = "independence") |>
  generate(reps = 1000, type = "permute") |>
  calculate(stat = "diff in means", order = c("yes", "no"))
null_height_dist

Response: height (numeric)
Explanatory: physical_3plus (factor)
Null Hypothesis: independence
# A tibble: 1,000 × 2
   replicate      stat
       <int>     <dbl>
 1         1 -0.00284 
 2         2 -0.000618
 3         3 -0.00164 
 4         4  0.00206 
 5         5  0.00222 
 6         6  0.000860
 7         7 -0.00191 
 8         8  0.000396
 9         9 -0.00393 
10        10  0.00195 
# ℹ 990 more rows

obs_height_dist  <- yrbss1 |>
  specify(height ~ physical_3plus) |>
  hypothesise(null = "independence") |>
  calculate(stat = "diff in means", order = c("yes", "no"))

Message: The independence null hypothesis does not inform calculation of the
observed statistic (a difference in means) and will be ignored.

obs_height_dist

Response: height (numeric)
Explanatory: physical_3plus (factor)
Null Hypothesis: independence
# A tibble: 1 × 1
    stat
   <dbl>
1 0.0376

ggplot(null_height_dist, aes(stat))+
  geom_histogram() +
  geom_vline(xintercept = pull(obs_height_dist), color = "purple")

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

null_height_dist %>%
  get_p_value(obs_stat = obs_height_dist, direction = "two_sided")

Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
based on the number of `reps` chosen in the `generate()` step.
ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.

# A tibble: 1 × 1
  p_value
    <dbl>
1       0

calculate exact proportion of the sample that responded this way

##The proportion who received a callback in the dataset was 0.080.*

The 95% confidence interval can be calculated as the sample proportion plus or minus two standard errors of the sample proportion

##The bootstrap is done with the function: specify()

We do this many times to create many bootstrap replicate data sets.

##Do this with the function generate()

Next, for each replicate, we calculate the sample statistic, in this case: the proportion of respondents that said “yes” to receiving callbacks.

Do this with the function calculate()

##The standard deviation of the stat variable in this data frame (the bootstrap distribution) is the bootstrap standard error and it can be calculated using the summarize() function.

We can use this value, along with our point estimate, to roughly calculate a 95% confidence interval:

##\(\hat{p} \pm z^*se\)

We are 95% confident that the true proportion of applicants receiving callbacks is between 7.28% and 8.81%.

The normal distribution for confidence interval

##Another option for calculating the CI is by estimating it by using the Normal Distribution (the bell curve)

##1. observations are independent ##2. n is large (S-F condition is met)