Inference Lab

library(tidyverse)
Warning: package 'tidyverse' was built under R version 4.3.2
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.3     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.3     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(openintro)
Warning: package 'openintro' was built under R version 4.3.2
Loading required package: airports
Warning: package 'airports' was built under R version 4.3.2
Loading required package: cherryblossom
Warning: package 'cherryblossom' was built under R version 4.3.2
Loading required package: usdata
Warning: package 'usdata' was built under R version 4.3.2
library(infer)
Warning: package 'infer' was built under R version 4.3.3
library(skimr)
Warning: package 'skimr' was built under R version 4.3.3
data(yrbss)
yrbss %>% 
  skim()
Data summary
Name Piped data
Number of rows 13583
Number of columns 13
_______________________
Column type frequency:
character 8
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
gender 12 1.00 4 6 0 2 0
grade 79 0.99 1 5 0 5 0
hispanic 231 0.98 3 8 0 2 0
race 2805 0.79 5 41 0 5 0
helmet_12m 311 0.98 5 12 0 6 0
text_while_driving_30d 918 0.93 1 13 0 8 0
hours_tv_per_school_day 338 0.98 1 12 0 7 0
school_night_hours_sleep 1248 0.91 1 3 0 7 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 77 0.99 16.16 1.26 12.00 15.00 16.00 17.00 18.00 ▁▂▅▅▇
height 1004 0.93 1.69 0.10 1.27 1.60 1.68 1.78 2.11 ▁▅▇▃▁
weight 1004 0.93 67.91 16.90 29.94 56.25 64.41 76.20 180.99 ▆▇▂▁▁
physically_active_7d 273 0.98 3.90 2.56 0.00 2.00 4.00 7.00 7.00 ▆▂▅▃▇
strength_training_7d 1176 0.91 2.95 2.58 0.00 0.00 3.00 5.00 7.00 ▇▂▅▂▅

##Exercise 1

##There are 5 case studies

yrbss1 <- yrbss %>% 
  mutate(physical_3plus = if_else(physically_active_7d > 2, "yes", "no")) |>
filter(!is.na(weight)&!is.na(physical_3plus))

##Exercise 2

yrbss1 %>%
  group_by(physical_3plus) %>%
  summarise(mean_weight = mean(weight, na.rm = TRUE))
# A tibble: 2 × 2
  physical_3plus mean_weight
  <chr>                <dbl>
1 no                    66.7
2 yes                   68.4

Exercise 3

violin plot

ggplot(yrbss1, aes(physical_3plus, weight, fill = physical_3plus))+
  geom_violin()

obs_diff <- yrbss1 %>%
  specify(weight ~ physical_3plus) %>%
  calculate(stat = "diff in means", order = c("yes", "no"))
obs_diff
Response: weight (numeric)
Explanatory: physical_3plus (factor)
# A tibble: 1 × 1
   stat
  <dbl>
1  1.77

##Exercise 4

##The data is normally distributed so we can do an inference test

##Exercise 5

null_dist <- yrbss1  %>%
  specify(weight ~ physical_3plus) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "diff in means", order = c("yes", "no"))
null_dist
Response: weight (numeric)
Explanatory: physical_3plus (factor)
Null Hypothesis: independence
# A tibble: 1,000 × 2
   replicate     stat
       <int>    <dbl>
 1         1  0.166  
 2         2  0.309  
 3         3 -0.124  
 4         4  0.202  
 5         5 -0.00315
 6         6 -0.489  
 7         7  0.317  
 8         8 -0.348  
 9         9 -0.313  
10        10  0.0759 
# ℹ 990 more rows

##Exercise 6

ggplot(data = null_dist, aes(x = stat)) +
  geom_histogram() + 
  geom_vline(xintercept = pull(obs_diff), color = "purple")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

exercise 7

null_dist %>%
  get_p_value(obs_stat = obs_diff, direction = "two_sided")
Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
based on the number of `reps` chosen in the `generate()` step.
ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.
# A tibble: 1 × 1
  p_value
    <dbl>
1       0

Exercise 8

The warning message is p value of 0 is an approximation of the population mean.

Exrercise 10

Calculate a 95% CI on height

boot_height <- yrbss1 |>
  filter(!is.na(height)) |>
  specify(response = height)|>
  generate(reps = 1000, type = "bootstrap") |>
  calculate(stat = "mean")
boot_height
Response: height (numeric)
# A tibble: 1,000 × 2
   replicate  stat
       <int> <dbl>
 1         1  1.69
 2         2  1.69
 3         3  1.69
 4         4  1.69
 5         5  1.69
 6         6  1.69
 7         7  1.69
 8         8  1.69
 9         9  1.69
10        10  1.69
# ℹ 990 more rows
boot_height |>
  get_confidence_interval(level = .95)
# A tibble: 1 × 2
  lower_ci upper_ci
     <dbl>    <dbl>
1     1.69     1.69

Exercise 11

boot_height |>
  get_confidence_interval(level = .90)
# A tibble: 1 × 2
  lower_ci upper_ci
     <dbl>    <dbl>
1     1.69     1.69

Exercise 12

HT for diff in mean height for yes and no for physical 3plus

null_height_dist  <- yrbss1 |>
  specify(height ~ physical_3plus) |>
  hypothesise(null = "independence") |>
  generate(reps = 1000, type = "permute") |>
  calculate(stat = "diff in means", order = c("yes", "no"))
null_height_dist
Response: height (numeric)
Explanatory: physical_3plus (factor)
Null Hypothesis: independence
# A tibble: 1,000 × 2
   replicate      stat
       <int>     <dbl>
 1         1 -0.00284 
 2         2 -0.000618
 3         3 -0.00164 
 4         4  0.00206 
 5         5  0.00222 
 6         6  0.000860
 7         7 -0.00191 
 8         8  0.000396
 9         9 -0.00393 
10        10  0.00195 
# ℹ 990 more rows
obs_height_dist  <- yrbss1 |>
  specify(height ~ physical_3plus) |>
  hypothesise(null = "independence") |>
  calculate(stat = "diff in means", order = c("yes", "no"))
Message: The independence null hypothesis does not inform calculation of the
observed statistic (a difference in means) and will be ignored.
obs_height_dist
Response: height (numeric)
Explanatory: physical_3plus (factor)
Null Hypothesis: independence
# A tibble: 1 × 1
    stat
   <dbl>
1 0.0376
ggplot(null_height_dist, aes(stat))+
  geom_histogram() +
  geom_vline(xintercept = pull(obs_height_dist), color = "purple")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

null_height_dist %>%
  get_p_value(obs_stat = obs_height_dist, direction = "two_sided")
Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
based on the number of `reps` chosen in the `generate()` step.
ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.
# A tibble: 1 × 1
  p_value
    <dbl>
1       0

calculate exact proportion of the sample that responded this way

##The proportion who received a callback in the dataset was 0.080.*

The 95% confidence interval can be calculated as the sample proportion plus or minus two standard errors of the sample proportion

##The bootstrap is done with the function: specify()

We do this many times to create many bootstrap replicate data sets.

##Do this with the function generate()

Next, for each replicate, we calculate the sample statistic, in this case: the proportion of respondents that said “yes” to receiving callbacks.

Do this with the function calculate()

##The standard deviation of the stat variable in this data frame (the bootstrap distribution) is the bootstrap standard error and it can be calculated using the summarize() function.

We can use this value, along with our point estimate, to roughly calculate a 95% confidence interval:

##\(\hat{p} \pm z^*se\)

We are 95% confident that the true proportion of applicants receiving callbacks is between 7.28% and 8.81%.

The normal distribution for confidence interval

##Another option for calculating the CI is by estimating it by using the Normal Distribution (the bell curve)

If

##1. observations are independent ##2. n is large (S-F condition is met)