library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
library(infer)
# Load the `yrbss` data set from the openintro package and preview its columns.
utils::data(list = "yrbss", package = "openintro")
yrbss %>%
  glimpse()
## Rows: 13,583
## Columns: 13
## $ age                      <int> 14, 14, 15, 15, 15, 15, 15, 14, 15, 15, 15, 1…
## $ gender                   <chr> "female", "female", "female", "female", "fema…
## $ grade                    <chr> "9", "9", "9", "9", "9", "9", "9", "9", "9", …
## $ hispanic                 <chr> "not", "not", "hispanic", "not", "not", "not"…
## $ race                     <chr> "Black or African American", "Black or Africa…
## $ height                   <dbl> NA, NA, 1.73, 1.60, 1.50, 1.57, 1.65, 1.88, 1…
## $ weight                   <dbl> NA, NA, 84.37, 55.79, 46.72, 67.13, 131.54, 7…
## $ helmet_12m               <chr> "never", "never", "never", "never", "did not …
## $ text_while_driving_30d   <chr> "0", NA, "30", "0", "did not drive", "did not…
## $ physically_active_7d     <int> 4, 2, 7, 0, 2, 1, 4, 4, 5, 0, 0, 0, 4, 7, 7, …
## $ hours_tv_per_school_day  <chr> "5+", "5+", "5+", "2", "3", "5+", "5+", "5+",…
## $ strength_training_7d     <int> 0, 0, 0, 0, 1, 0, 2, 0, 3, 0, 3, 0, 0, 7, 7, …
## $ school_night_hours_sleep <chr> "8", "6", "<5", "6", "9", "8", "9", "6", "<5"…

Exercise 1

What are the cases in this data set? How many cases are there in our sample?

Each case in this data set is one surveyed student (one row per respondent). There are 13,583 cases in the sample.

Exercise 2

How many observations are we missing weights from?

# Five-number summary of weight; the NA's entry is the count of missing weights.
yrbss %>%
  pull(weight) %>%
  summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   29.94   56.25   64.41   67.91   76.20  180.99    1004

We are missing weights from 1004 people.

Exercise 3

Make a side-by-side boxplot of physical_3plus and weight. Is there a relationship between these two variables? What did you expect and why?

# Flag students who were physically active on more than 2 of the last 7 days
# (i.e. at least 3 days); rows with a missing activity count stay NA.
yrbss <- mutate(
  yrbss,
  physical_3plus = if_else(physically_active_7d > 2, "yes", "no")
)

# Boxplots of weight, one per activity group.
ggplot(yrbss, aes(x = weight, y = physical_3plus)) +
  geom_boxplot()
## Warning: Removed 1004 rows containing non-finite values (stat_boxplot).

Exercise 4

Are all conditions necessary for inference satisfied? Comment on each. You can compute the group sizes with the summarize command above by defining a new variable with the definition n().

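There are two conditions for inference:

  1. Sample size: each group should contain at least 30 observations (or the data should be approximately normal).
  2. Independence: the observations should be sampled independently, both within and between groups.

Both groups contain far more than 30 observations (see the group counts below), and we can assume the students were sampled independently.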

# Mean weight (ignoring missing weights) and number of rows per activity group.
summarise(
  group_by(yrbss, physical_3plus),
  mean_weight = mean(weight, na.rm = TRUE),
  count = n()
)

Exercise 5

Write the hypotheses for testing if the average weights are different for those who exercise at least three times a week and those who don’t.

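Null hypothesis (H0): students who exercise at least three times a week and those who don’t have the same average weight. Alternative hypothesis (HA): the average weights of the two groups are different.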

Exercise 6

How many of these null permutations have a difference of at least obs_stat?

There are none

# Exercise 6: permutation test for a difference in mean weight between
# students who exercise at least 3 days a week ("yes") and those who don't.

# Fix the RNG seed so the permutation results are reproducible across knits.
set.seed(1234)

# Drop rows missing either variable once, and reuse the result for both the
# null distribution and the observed statistic.
yrbss_complete <- yrbss %>%
  filter(!is.na(physical_3plus), !is.na(weight))

# Null distribution: differences in means (yes - no) under 1000 random
# reassignments of the group labels.
null_dist <- yrbss_complete %>%
  specify(weight ~ physical_3plus) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "diff in means", order = c("yes", "no"))

# Observed difference in means (yes - no).
obs_diff <- yrbss_complete %>%
  specify(weight ~ physical_3plus) %>%
  calculate(stat = "diff in means", order = c("yes", "no"))

# bins = 30 is the value stat_bin() would pick by default; stating it
# explicitly avoids the "Pick better value" message.
ggplot(data = null_dist, aes(x = stat)) +
  geom_histogram(bins = 30)

# Number of null permutations whose difference is at least as extreme as the
# observed one (this is the quantity Exercise 6 asks about).
sum(abs(null_dist$stat) >= abs(obs_diff$stat))

null_dist %>%
  get_p_value(obs_stat = obs_diff, direction = "two_sided")
## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step. See
## `?get_p_value()` for more information.

Exercise 7

Construct and record a confidence interval for the difference between the weights of those who exercise at least three times a week and those who don’t, and interpret this interval in context of the data.

# Exercise 7: confidence interval for the difference in mean weight between
# students who exercise at least 3 days a week and those who don't.

# Keep only rows where both the exercise flag and the weight are known, then
# group by the flag so the summary below is computed per group.
test <- yrbss %>%
  mutate(has_exercise = ifelse(physically_active_7d > 2, "yes", "no")) %>%
  filter(!is.na(has_exercise), !is.na(weight)) %>%
  group_by(has_exercise)
test

# Per-group sample size, mean and standard deviation of weight.
# NOTE: the name `height_data` is kept so existing references still work,
# but the values are weight summaries.
height_data <- summarise(test, n = n(), mean = mean(weight), sd = sd(weight))
height_data

# 95% Welch confidence interval for mean(yes) - mean(no). The factor levels
# are ordered so the difference is taken as "yes" minus "no".
weight_diff_ci <- t.test(
  weight ~ factor(has_exercise, levels = c("yes", "no")),
  data = ungroup(test),
  conf.level = 0.95
)$conf.int
weight_diff_ci

In practical terms, students who exercise at least three times a week have a higher average weight than those who don’t.

Exercise 8

Calculate a 95% confidence interval for the average height in meters (height) and interpret it in context.

# Exercise 8: 95% t-based confidence interval for the mean height (metres).
# The mean and sd ignore missing heights, so the standard error and degrees of
# freedom must use the number of NON-missing heights, not all 13583 rows.
heights <- yrbss$height[!is.na(yrbss$height)]
n_height <- length(heights)

mean_height <- mean(heights)
mean_height
## [1] 1.691241
sd_height <- sd(heights)
sd_height
## [1] 0.1046973

# Standard error of the mean, based on the observed heights only.
sde_height <- sd_height / sqrt(n_height)
sde_height
# Re-knit to refresh the printed value; roughly 0.00093 if 1,004 heights are
# missing (the count reported by the height boxplot warning).

# Upper-tail critical value for a two-sided 95% interval, df = n - 1.
tval_height <- qt(.05 / 2, df = n_height - 1, lower.tail = FALSE)
leftintheight <- mean_height - tval_height * sde_height
rightintheight <- mean_height + tval_height * sde_height
ci_up <- rightintheight  # same value as before; kept for existing references
leftintheight
rightintheight
# Re-knit to refresh the printed bounds; roughly (1.6894, 1.6931).

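The 95% CI is approximately (1.689, 1.693): we are 95% confident that the average height of the students in this population is between about 1.689 m and 1.693 m.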

Exercise 9

Calculate a new confidence interval for the same parameter at the 90% confidence level. Comment on the width of this interval versus the one obtained in the previous exercise.

# Exercise 9: 90% t-based confidence interval for the mean height (metres).
# As the mean and sd ignore missing heights, the standard error and degrees of
# freedom use the number of NON-missing heights rather than all 13583 rows.
heights <- yrbss$height[!is.na(yrbss$height)]
n_height <- length(heights)

mean_height <- mean(heights)
mean_height
## [1] 1.691241
sd_height <- sd(heights)
sd_height
## [1] 0.1046973

# Standard error of the mean, based on the observed heights only.
sde_height <- sd_height / sqrt(n_height)
sde_height
# Re-knit to refresh the printed value; roughly 0.00093 if 1,004 heights are
# missing (the count reported by the height boxplot warning).

# Upper-tail critical value for a two-sided 90% interval, df = n - 1.
tval_height <- qt(.1 / 2, df = n_height - 1, lower.tail = FALSE)
leftintheight <- mean_height - tval_height * sde_height
rightintheight <- mean_height + tval_height * sde_height
ci_up <- rightintheight  # same value as before; kept for existing references
leftintheight
rightintheight
# Re-knit to refresh the printed bounds; roughly (1.6897, 1.6928).

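The 90% CI is approximately (1.690, 1.693). It is narrower than the 95% interval (about 0.003 m wide versus about 0.004 m): lowering the confidence level shrinks the critical value, so the interval covers a smaller range around the same sample mean.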

Exercise 10

Conduct a hypothesis test evaluating whether the average height is different for those who exercise at least three times a week and those who don’t.

Question: Does exercise encourage more growth? Null hypothesis: There is no relationship between exercising at least three times a week and height. Confidence level: 95%; α level: 0.05

# Exercise 10: Welch two-sample t-test comparing mean height between students
# who exercise at least 3 days a week and those who don't.
# Uses the existing `physical_3plus` flag (physically_active_7d > 2) instead
# of storing the same exercise indicator under the misleading name
# `night_sleep_hours`.
t.test(height ~ physical_3plus, data = yrbss)
## 
##  Welch Two Sample t-test
## 
## data:  height by physical_3plus
## t = -19.029, df = 7973.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group no and group yes is not equal to 0
## 95 percent confidence interval:
##  -0.04150183 -0.03374994
## sample estimates:
##  mean in group no mean in group yes 
##          1.665587          1.703213

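Because p < 0.05 (p-value < 2.2e-16), we reject the null hypothesis: the average height differs between the two groups. Students who exercise at least three times a week are, on average, about 3.4 to 4.2 cm taller than those who don’t (95% CI for the difference, no − yes: −0.042 m to −0.034 m).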

Exercise 11

Now, a non-inference task: Determine the number of different options there are in the dataset for the hours_tv_per_school_day there are.

# Number of rows for each distinct value of hours_tv_per_school_day.
summarise(
  group_by(yrbss, hours_tv_per_school_day),
  n()
)

There are 8 different options, including NA.

Exercise 12

Come up with a research question evaluating the relationship between height or weight and sleep. Formulate the question in a way that it can be answered using a hypothesis test and/or a confidence interval. Report the statistical results, and also provide an explanation in plain language. Be sure to check all assumptions, state your α level, and conclude in context.

Question: Does a full night’s sleep encourage more growth? Null hypothesis: There is no relationship between hours of sleep on school nights and height. Confidence level: 95%; α level: 0.05

# Exercise 12: classify students by whether they sleep more than 7 hours on
# school nights, then compare heights between the two groups.

# Row counts for each reported sleep category.
yrbss %>% group_by(school_night_hours_sleep) %>% summarise(n())

# `school_night_hours_sleep` is a character column with labels such as "<5"
# and "8". Comparing it to 7 directly would compare STRINGS (so a label like
# "10+" sorts below "7", and "<5" depends on the locale). Extract the number
# from each label first, then compare numerically. Missing values stay NA.
yrbss <- yrbss %>%
  mutate(
    sleep_hours_num = readr::parse_number(school_night_hours_sleep),
    full_sleep = ifelse(sleep_hours_num > 7, "yes", "no"),
    full_sleep_binary = ifelse(sleep_hours_num > 7, "1", "0"),
    # Same flag under the name used by the plot below.
    sleep = full_sleep
  )

# Boxplots of height, one per sleep group.
ggplot(yrbss, aes(x = height, y = sleep)) +
  geom_boxplot()
## Warning: Removed 1004 rows containing non-finite values (stat_boxplot).

Looking at the plot, there appears to be at most a loose relationship, but let’s dig in and test it formally.

# Welch two-sample t-test of height by sleep group (more than 7 hours or not).
# The sleep column is character ("<5", "8", ...), so extract the number from
# each label before comparing; comparing the strings to 7 misclassifies
# labels such as "10+". Missing values stay NA.
yrbss$night_sleep_hours <- ifelse(
  readr::parse_number(yrbss$school_night_hours_sleep) > 7,
  "yes",
  "no"
)
t.test(height ~ night_sleep_hours, data = yrbss)
# NOTE: the output below was recorded before the numeric comparison was
# introduced; re-knit the document to refresh it.
## 
##  Welch Two Sample t-test
## 
## data:  height by night_sleep_hours
## t = -1.5598, df = 5814.9, p-value = 0.1189
## alternative hypothesis: true difference in means between group no and group yes is not equal to 0
## 95 percent confidence interval:
##  -0.0076754326  0.0008733169
## sample estimates:
##  mean in group no mean in group yes 
##          1.689770          1.693171

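The p-value is 0.1189, which is greater than α = 0.05, so we fail to reject the null hypothesis: the data do not provide convincing evidence that average height differs between students who get a full night’s sleep and those who don’t. Consistent with this, the 95% confidence interval for the difference (−0.0077 m to 0.0009 m) contains 0.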