Exercise 1

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.2
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.0      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2
## Warning: package 'purrr' was built under R version 4.2.2
## Warning: package 'dplyr' was built under R version 4.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
library(infer)
glimpse(yrbss)
## Rows: 13,583
## Columns: 13
## $ age                      <int> 14, 14, 15, 15, 15, 15, 15, 14, 15, 15, 15, 1…
## $ gender                   <chr> "female", "female", "female", "female", "fema…
## $ grade                    <chr> "9", "9", "9", "9", "9", "9", "9", "9", "9", …
## $ hispanic                 <chr> "not", "not", "hispanic", "not", "not", "not"…
## $ race                     <chr> "Black or African American", "Black or Africa…
## $ height                   <dbl> NA, NA, 1.73, 1.60, 1.50, 1.57, 1.65, 1.88, 1…
## $ weight                   <dbl> NA, NA, 84.37, 55.79, 46.72, 67.13, 131.54, 7…
## $ helmet_12m               <chr> "never", "never", "never", "never", "did not …
## $ text_while_driving_30d   <chr> "0", NA, "30", "0", "did not drive", "did not…
## $ physically_active_7d     <int> 4, 2, 7, 0, 2, 1, 4, 4, 5, 0, 0, 0, 4, 7, 7, …
## $ hours_tv_per_school_day  <chr> "5+", "5+", "5+", "2", "3", "5+", "5+", "5+",…
## $ strength_training_7d     <int> 0, 0, 0, 0, 1, 0, 2, 0, 3, 0, 3, 0, 0, 7, 7, …
## $ school_night_hours_sleep <chr> "8", "6", "<5", "6", "9", "8", "9", "6", "<5"…

Exercise 2

summary(yrbss$weight)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   29.94   56.25   64.41   67.91   76.20  180.99    1004

We are missing data from 1004 observations.

Exerccise 3

yrbss <- yrbss %>% 
  mutate(physical_3plus = ifelse(yrbss$physically_active_7d > 2, "yes", "no"))

ggplot(yrbss, aes(x=weight, y=physical_3plus)) + geom_boxplot() 
## Warning: Removed 1004 rows containing non-finite values (stat_boxplot).

It seems that the median of weight of those who were physically active 3 days a week is slightly higher than that of those who were not physically active. This is not as expected as most people who are physically active are so so they can lose weight. This may be because physically active may mean lifting weights which is known to add weight.

Exercise 4

In order for inference to be satisfied, the data must be normal and independent. It is normal in that we have more than 30 values recorded. It is independent in that a respondents physical weight is not directly linked to them not

Exercise 5

The null hypothesis is that there is no difference in those who exercise three times a week and those who don’t.

Exercise 6

obs_diff <- na.omit(yrbss) %>%
  specify(weight ~ physical_3plus) %>%
  calculate(stat = "diff in means", order = c("yes", "no")) 

null_dist <- yrbss %>% na.omit(yrbss) %>%
  specify(weight ~ physical_3plus) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "diff in means", order = c("yes", "no"))

ggplot(data = null_dist, aes(x = stat)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

null_dist %>% filter(stat >= obs_diff) %>% nrow()
## [1] 0

There are no permutations that have a difference of at least obs_stat.

Exercise 7

phys3 <- filter(yrbss, yrbss$physical_3plus == "yes")

mean_phys3weight<-mean(phys3$weight, na.rm =TRUE)
sd_phys3weight<-sd(phys3$weight, na.rm=TRUE)

yes_n = nrow(phys3)


upperphys3 <- mean_phys3weight+1.96*(sd_phys3weight/sqrt(yes_n))
lowerphys3 <- mean_phys3weight-1.96*(sd_phys3weight/sqrt(yes_n))


nophys3 <- filter(yrbss, yrbss$physical_3plus == "no")
mean_nophys3weight<-mean(nophys3$weight, na.rm =TRUE)
sd_nophys3weight<-sd(nophys3$weight, na.rm=TRUE)

no_n = nrow(nophys3)

uppernophys3 <- mean_nophys3weight+1.96*(sd_nophys3weight/sqrt(no_n))
lowernophys3 <- mean_nophys3weight-1.96*(sd_nophys3weight/sqrt(no_n))

lowerphys3
## [1] 68.10623
upperphys3
## [1] 68.79071
lowernophys3 
## [1] 66.15295
uppernophys3
## [1] 67.19482

We can ve 95% confident that the average weight of those who exercise at least 3 times a week is from 68.10623 kgs to 68.79071 and for those who don’t being from 66.15295 to 67.19482 kgs.

Exercise 8

mean_height <- mean(yrbss$height, na.rm = TRUE)
sd_height <- sd(yrbss$height, na.rm = TRUE)

n <- nrow(yrbss)

upperheight <- mean_height+1.96*(sd_height/sqrt(n))
lowerheight <- mean_height-1.96*(sd_height/sqrt(n))

upperheight
## [1] 1.693002
lowerheight
## [1] 1.68948

It is 95% confident that the average height of the students is between 1.689 and 1.693 m.

Exercise 9

upperheight <- mean_height+1.692*(sd_height/sqrt(n))
lowerheight <- mean_height-1.692*(sd_height/sqrt(n))
upperheight
## [1] 1.692761
lowerheight
## [1] 1.689721

The interval for 90% is slightly narrower than the interval previously. This makes sense as with a smaller margin, we are not as sure that the mean will lie there.

Exercise 10

The null hypothesis is that there is no difference in the average height of physically active than those who aren’t.

Alternative hypothesis is that there is a difference in height between those who are physically active and aren’t.

obs_diff_hgt <- yrbss %>% na.omit(yrbss) %>%
  specify(height ~ physical_3plus) %>%
  calculate(stat = "diff in means", order = c("yes", "no"))

null_dist_hgt <- yrbss %>% na.omit(yrbss) %>%
  specify(height ~ physical_3plus) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "diff in means", order = c("yes", "no"))

visualize(null_dist_hgt) + 
  shade_p_value(obs_stat = obs_diff_hgt, direction = "two_sided")

null_dist_hgt %>%
  get_p_value(obs_stat = obs_diff_hgt, direction = "two_sided")
## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step. See
## `?get_p_value()` for more information.
## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1       0
phys3 <- filter(yrbss, yrbss$physical_3plus == "yes")

mean_phys3height<-mean(phys3$height, na.rm =TRUE)
sd_phys3height<-sd(phys3$height, na.rm=TRUE)

upperheight <- mean_height+1.96*(sd_height/sqrt(n))
lowerheight <- mean_height-1.96*(sd_height/sqrt(n))
upperheight
## [1] 1.693002
lowerheight
## [1] 1.68948
n = nrow(phys3)



nophys3 <- filter(yrbss, yrbss$physical_3plus == "no")
mean_nophys3height<-mean(nophys3$height, na.rm =TRUE)
sd_nophys3height<-sd(nophys3$height, na.rm=TRUE)

no_n = nrow(nophys3)

uppernophys3h <- mean_nophys3height+1.96*(sd_nophys3height/sqrt(no_n))
lowernophys3h <- mean_nophys3height-1.96*(sd_nophys3height/sqrt(no_n))

uppernophys3h
## [1] 1.668625
lowernophys3h
## [1] 1.662549

Since the p value is miniscule, the null value is rejected and the difference in heights of those who exercise is maintained and based on the confidence interval, those who exercise are generally taller.

Exercise 11

table(yrbss$hours_tv_per_school_day)
## 
##           <1            1            2            3            4           5+ 
##         2168         1750         2705         2139         1048         1595 
## do not watch 
##         1840

there are 7 different options for hours of tv per school day.

Exercise 12

yrbss <- yrbss %>%
  mutate(sleep_6plus = ifelse(yrbss$school_night_hours_sleep > 5, "yes", "no"))

heightlesssleep <- yrbss %>% 
  select(height, sleep_6plus) %>% 
  filter(sleep_6plus == "no") %>%
  na.omit()

heightmoresleep <- yrbss %>% 
  select(height, sleep_6plus) %>% 
  filter(sleep_6plus == "yes") %>%
  na.omit()

Research question: Do those who have 8 or more hours of sleep a night during school days taller than those who don’t?

null hypothesis: There is no relationship between height and sleeping for more than 8 hours.

alternative: There is a relationship between height and sleeping more than 8 hours.

\(\alpha\) = .95

sleep8 <- filter(yrbss, yrbss$school_night_hours_sleep >= 8)


mean_sleep8<-mean(sleep8$height, na.rm =TRUE)
sd_sleep8<-sd(phys3$height, na.rm=TRUE)

f = nrow(sleep8)


uppersleep8 <- mean_sleep8+1.96*(sd_sleep8/sqrt(f))
lowersleep8 <- mean_sleep8-1.96*(sd_sleep8/sqrt(f))


nosleep8 <- filter(yrbss, yrbss$school_night_hours_sleep < 8)
mean_nosleep8<-mean(nosleep8$height, na.rm =TRUE)
sd_nosleep8<-sd(nosleep8$height, na.rm=TRUE)

fn = nrow(nosleep8)

uppernosleep8 <- mean_nosleep8+1.96*(sd_nosleep8/sqrt(fn))
lowernosleep8 <- mean_nosleep8-1.96*(sd_nosleep8/sqrt(fn))

lowersleep8
## [1] 1.689727
uppersleep8
## [1] 1.696616
lowernosleep8 
## [1] 1.687598
uppernosleep8
## [1] 1.691943

Based on the confidence interval of 95%, those who sleep 8 hours range from 1.689 to 1.696 as opposed to those who don’t which range from 1.687 to 1.691. The null hypothesis is rejected as there is a slight increase in height of those who sleep 8 hours a night on a school night than those who don’t.