Inference for Numerical Data Lab

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(openintro)
Loading required package: airports
Loading required package: cherryblossom
Loading required package: usdata
library(infer)
library(skimr)
data(yrbss)
?yrbss

##Exercise 1 What are the cases in this data set? How many cases are there in our sample?

glimpse(yrbss)
Rows: 13,583
Columns: 13
$ age                      <int> 14, 14, 15, 15, 15, 15, 15, 14, 15, 15, 15, 1…
$ gender                   <chr> "female", "female", "female", "female", "fema…
$ grade                    <chr> "9", "9", "9", "9", "9", "9", "9", "9", "9", …
$ hispanic                 <chr> "not", "not", "hispanic", "not", "not", "not"…
$ race                     <chr> "Black or African American", "Black or Africa…
$ height                   <dbl> NA, NA, 1.73, 1.60, 1.50, 1.57, 1.65, 1.88, 1…
$ weight                   <dbl> NA, NA, 84.37, 55.79, 46.72, 67.13, 131.54, 7…
$ helmet_12m               <chr> "never", "never", "never", "never", "did not …
$ text_while_driving_30d   <chr> "0", NA, "30", "0", "did not drive", "did not…
$ physically_active_7d     <int> 4, 2, 7, 0, 2, 1, 4, 4, 5, 0, 0, 0, 4, 7, 7, …
$ hours_tv_per_school_day  <chr> "5+", "5+", "5+", "2", "3", "5+", "5+", "5+",…
$ strength_training_7d     <int> 0, 0, 0, 0, 1, 0, 2, 0, 3, 0, 3, 0, 0, 7, 7, …
$ school_night_hours_sleep <chr> "8", "6", "<5", "6", "9", "8", "9", "6", "<5"…
# The cases in this data set are US high-schoolers grades 9 through 12. There are 13,583 cases in the sample.

Exercise 2

How many observations are we missing weights from?

yrbss |> skim()
Data summary
Name yrbss
Number of rows 13583
Number of columns 13
_______________________
Column type frequency:
character 8
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
gender 12 1.00 4 6 0 2 0
grade 79 0.99 1 5 0 5 0
hispanic 231 0.98 3 8 0 2 0
race 2805 0.79 5 41 0 5 0
helmet_12m 311 0.98 5 12 0 6 0
text_while_driving_30d 918 0.93 1 13 0 8 0
hours_tv_per_school_day 338 0.98 1 12 0 7 0
school_night_hours_sleep 1248 0.91 1 3 0 7 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 77 0.99 16.16 1.26 12.00 15.00 16.00 17.00 18.00 ▁▂▅▅▇
height 1004 0.93 1.69 0.10 1.27 1.60 1.68 1.78 2.11 ▁▅▇▃▁
weight 1004 0.93 67.91 16.90 29.94 56.25 64.41 76.20 180.99 ▆▇▂▁▁
physically_active_7d 273 0.98 3.90 2.56 0.00 2.00 4.00 7.00 7.00 ▆▂▅▃▇
strength_training_7d 1176 0.91 2.95 2.58 0.00 0.00 3.00 5.00 7.00 ▇▂▅▂▅
# We are missing weights from 1004 observations.
yrbss1 <- yrbss %>% 
  filter(!is.na(weight))|>
  mutate(physical_3plus = if_else(physically_active_7d > 2, "yes", "no"))|>
  filter(!is.na(physical_3plus))
head(yrbss1)
# A tibble: 6 × 14
    age gender grade hispanic race                      height weight helmet_12m
  <int> <chr>  <chr> <chr>    <chr>                      <dbl>  <dbl> <chr>     
1    15 female 9     hispanic Native Hawaiian or Other…   1.73   84.4 never     
2    15 female 9     not      Black or African American   1.6    55.8 never     
3    15 female 9     not      Black or African American   1.5    46.7 did not r…
4    15 female 9     not      Black or African American   1.57   67.1 did not r…
5    15 female 9     not      Black or African American   1.65  132.  did not r…
6    14 male   9     not      Black or African American   1.88   71.2 never     
# ℹ 6 more variables: text_while_driving_30d <chr>, physically_active_7d <int>,
#   hours_tv_per_school_day <chr>, strength_training_7d <int>,
#   school_night_hours_sleep <chr>, physical_3plus <chr>

Exercise 3

Make a side-by-side violin plots of physical_3plus and weight. Is there a relationship between these two variables? What did you expect and why?

ggplot(yrbss1, aes(physical_3plus, weight, fill(physical_3plus))) +
  geom_point(alpha = .3) +
  geom_violin()

# It seems that those who exercise for three or more days a week match the weight of those that do not in the more average range from 50 - 100, but have less outliers. I would've expected this trend, because more muscle adds weight, but prevents obesity.
yrbss2 <- yrbss1 %>%
  group_by(physical_3plus) %>%
  summarise(mean_weight = mean(weight, na.rm = TRUE), count = n())
yrbss2
# A tibble: 2 × 3
  physical_3plus mean_weight count
  <chr>                <dbl> <int>
1 no                    66.7  4022
2 yes                   68.4  8342

Exercise 4

Are all conditions necessary for inference satisfied? Comment on each. You can compute the group sizes with the summarize command above by defining a new variable with the definition n().

# If we assume that the observations are independent, then all conditions are satisfied. There are 4022 failures and 8342 successes, vastly exceeding the requisite 10.

Exercise 5

Write the hypotheses for testing if the average weights are different for those who exercise at least three times a week and those who don’t.

\(H_o: \bar{X}_E = \bar{X}_N\) \(H_a: \bar{X}_E ≠ \bar{X}_N\)

obs_diff <- yrbss1 %>%
  specify(weight ~ physical_3plus) %>%
  calculate(stat = "diff in means", order = c("yes", "no"))
obs_diff
Response: weight (numeric)
Explanatory: physical_3plus (factor)
# A tibble: 1 × 1
   stat
  <dbl>
1  1.77
null_dist <- yrbss1 %>%
  specify(weight ~ physical_3plus) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "diff in means", order = c("yes", "no"))
null_dist
Response: weight (numeric)
Explanatory: physical_3plus (factor)
Null Hypothesis: independence
# A tibble: 1,000 × 2
   replicate    stat
       <int>   <dbl>
 1         1 -0.165 
 2         2 -0.196 
 3         3 -0.0357
 4         4  0.203 
 5         5  0.420 
 6         6 -0.0647
 7         7 -0.796 
 8         8 -0.401 
 9         9  1.12  
10        10 -0.391 
# ℹ 990 more rows
ggplot(data = null_dist, aes(x = stat)) +
  geom_vline(xintercept = pull(obs_diff)) +
  geom_histogram()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Exercise 6

Add a vertical red line to the plot above, demonstrating where the observed difference in means (obs_diff) falls on the distribution.

Exercise 7

How many of these null_dist permutations have a difference at least as large (or larger) as obs_diff?

# None of these permutations have a difference at least as large as obs_diff.
null_dist %>%
  get_p_value(obs_stat = obs_diff, direction = "two_sided")
Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
based on the number of `reps` chosen in the `generate()` step.
ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.
# A tibble: 1 × 1
  p_value
    <dbl>
1       0

Exercise 8

What warning message do you get? Why do you think you get this warning message?

# We get a warning message that says "Please be cautious in reporting a p-value of 0. This result is an approximation based on the number of `reps` chosen in the `generate()` step." This is because no p-value can truly be 0; there is always a very small chance that the variation within the sample is not due to random chance.

Exercise 9

Construct and record a confidence interval for the difference between the weights of those who exercise at least three times a week and those who don’t, and interpret this interval in context of the data.

boot <- yrbss1|>
  specify(weight ~ physical_3plus)|>
  generate(reps = 500, type = "bootstrap")|>
  calculate(stat = "diff in means")
Warning: The statistic is based on a difference or ratio; by default, for
difference-based statistics, the explanatory variable is subtracted in the
order "no" - "yes", or divided in the order "no" / "yes" for ratio-based
statistics. To specify this order yourself, supply `order = c("no", "yes")` to
the calculate() function.
boot
Response: weight (numeric)
Explanatory: physical_3plus (factor)
# A tibble: 500 × 2
   replicate  stat
       <int> <dbl>
 1         1 -2.54
 2         2 -1.80
 3         3 -2.00
 4         4 -2.11
 5         5 -1.60
 6         6 -2.24
 7         7 -2.23
 8         8 -1.69
 9         9 -1.69
10        10 -1.96
# ℹ 490 more rows
boot_ci <- boot|>
  get_confidence_interval()
Using `level = 0.95` to compute confidence interval.
boot_ci
# A tibble: 1 × 2
  lower_ci upper_ci
     <dbl>    <dbl>
1    -2.41    -1.12
# We are 95% confident that the difference between the average weights of those who exercise at least three times a week and those who don't is between -2.40 and -1.13. This evidence within the data supports the claim that exercising at least three times a week decreases weight.

Exercise 10

Calculate a 95% confidence interval for the average height in meters (height) and interpret it in context.

boot1 <- yrbss1|>
  specify(response = height)|>
  generate(reps = 500, type = "bootstrap")|>
  calculate(stat = "mean")
boot1
Response: height (numeric)
# A tibble: 500 × 2
   replicate  stat
       <int> <dbl>
 1         1  1.69
 2         2  1.69
 3         3  1.69
 4         4  1.69
 5         5  1.69
 6         6  1.69
 7         7  1.69
 8         8  1.69
 9         9  1.69
10        10  1.69
# ℹ 490 more rows
boot1_ci <- boot1|>
  get_confidence_interval(level = 0.95)
boot1_ci
# A tibble: 1 × 2
  lower_ci upper_ci
     <dbl>    <dbl>
1     1.69     1.69
#We are 95% confident that the average height in meters is between 1.689 and 1.693 meters.

Exercise 11

Calculate a new confidence interval for the same parameter at the 90% confidence level. Comment on the width of this interval versus the one obtained in the previous exercise.

boot1_ci2 <- boot1|>
  get_confidence_interval(level = 0.90)
boot1_ci2
# A tibble: 1 × 2
  lower_ci upper_ci
     <dbl>    <dbl>
1     1.69     1.69
# This confidence interval is slightly less wide, due to the decreasing confidence level. However, the two values were already quite close together in the 95% confidence interval, so it can't get too much smaller.

Exercise 12

Conduct a hypothesis test evaluating whether the average height is different for those who exercise at least three times a week and those who don’t.

hypo_test1 <- yrbss1|>
  specify(height ~ physical_3plus)|>
  hypothesize(null = "independence")|>
  generate(reps = 500, type = "permute")|>
  calculate(stat = "diff in means", order = c("yes", "no"))
obs_diff1 <- yrbss1 %>%
  specify(height ~ physical_3plus) %>%
  calculate(stat = "diff in means", order = c("yes", "no"))
obs_diff1
Response: height (numeric)
Explanatory: physical_3plus (factor)
# A tibble: 1 × 1
    stat
   <dbl>
1 0.0376
hypo_test1|>
  get_p_value(obs_stat = obs_diff1, direction = "two-sided")
Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
based on the number of `reps` chosen in the `generate()` step.
ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.
# A tibble: 1 × 1
  p_value
    <dbl>
1       0
# The p-value is close to 0, and we reject the null. There is a difference in height between those who exercise and those who don't

Exercise 13

Now, a non-inference task: Determine the number of different options there are in the dataset for the hours_tv_per_school_day there are.

unique_levels <- unique(yrbss1$hours_tv_per_school_day)
unique_levels
[1] "5+"           "2"            "3"            "do not watch" "<1"          
[6] "4"            "1"            NA            
# There are 8 different options for the "hours_tv_per_school_day" variable within the dataset.

Exercise 14

Come up with a research question evaluating the relationship between height or weight and sleep. Formulate the question in a way that it can be answered using a hypothesis test and/or a confidence interval. Report the statistical results, and also provide an explanation in plain language. Be sure to check all assumptions, state your α level, and conclude in context.

# First, we change the sleep variable to be numerical.

yrbss3 <- yrbss1 |>
  rename(sleep = school_night_hours_sleep)
yrbss3$sleep<-gsub("[^0-9.-]", "", yrbss3$sleep)
yrbss3$sleep <- as.numeric(yrbss3$sleep)
yrbss3 <- yrbss3|> 
  mutate(sleep_8plus = if_else(sleep >= 8, "yes", "no"))|>
  filter(!is.na(sleep_8plus))


# Research question: does sleeping at least hours a night have an effect on height?

# Check inference standards, assuming that the observations are independent.

yrbss3|>
  group_by(sleep_8plus) %>%
  summarise(mean_height = mean(height, na.rm = TRUE), count = n())
# A tibble: 2 × 3
  sleep_8plus mean_height count
  <chr>             <dbl> <int>
1 no                 1.69  7988
2 yes                1.69  3454
# We have 7988 counts of no and 3454 counts of yes. 

# Create obs_diff:

obs_diff2 <- yrbss3 %>%
  specify(height ~ sleep_8plus) %>%
  calculate(stat = "diff in means", order = c("yes", "no"))
obs_diff2
Response: height (numeric)
Explanatory: sleep_8plus (factor)
# A tibble: 1 × 1
     stat
    <dbl>
1 0.00200
# Run permutations:

hypo_test2 <- yrbss3|>
  specify(height ~ sleep_8plus)|>
  hypothesize(null = "independence")|>
  generate(reps = 500, type = "permute")|>
  calculate(stat = "diff in means", order = c("yes", "no"))
hypo_test2
Response: height (numeric)
Explanatory: sleep_8plus (factor)
Null Hypothesis: independence
# A tibble: 500 × 2
   replicate      stat
       <int>     <dbl>
 1         1 -0.00166 
 2         2  0.00484 
 3         3  0.00637 
 4         4  0.00131 
 5         5 -0.000361
 6         6  0.00101 
 7         7  0.00472 
 8         8  0.00114 
 9         9 -0.00399 
10        10  0.000863
# ℹ 490 more rows
# Calculate p-value
hypo_test2|>
  get_p_value(obs_stat = obs_diff2, direction = "two sided")
# A tibble: 1 × 1
  p_value
    <dbl>
1   0.308
# Our alpha level is 0.05. We receive a p-value of 0.376, which is greater than the alpha level. We fail to reject the null hypothesis: the data provided gives no evidence of an existing correlation between 8 or more hours of sleep per night and height.