library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'tibble' was built under R version 4.3.3
## Warning: package 'tidyr' was built under R version 4.3.3
## Warning: package 'readr' was built under R version 4.3.3
## Warning: package 'purrr' was built under R version 4.3.3
## Warning: package 'dplyr' was built under R version 4.3.3
## Warning: package 'stringr' was built under R version 4.3.3
## Warning: package 'forcats' was built under R version 4.3.3
## Warning: package 'lubridate' was built under R version 4.3.3
library(openintro)
## Warning: package 'openintro' was built under R version 4.3.3
## Warning: package 'airports' was built under R version 4.3.3
## Warning: package 'cherryblossom' was built under R version 4.3.3
## Warning: package 'usdata' was built under R version 4.3.3
library(infer)
## Warning: package 'infer' was built under R version 4.3.3
library(skimr)
## Warning: package 'skimr' was built under R version 4.3.3
data("yrbss")

Exercise 1

glimpse(yrbss)
## Rows: 13,583
## Columns: 13
## $ age                      <int> 14, 14, 15, 15, 15, 15, 15, 14, 15, 15, 15, 1…
## $ gender                   <chr> "female", "female", "female", "female", "fema…
## $ grade                    <chr> "9", "9", "9", "9", "9", "9", "9", "9", "9", …
## $ hispanic                 <chr> "not", "not", "hispanic", "not", "not", "not"…
## $ race                     <chr> "Black or African American", "Black or Africa…
## $ height                   <dbl> NA, NA, 1.73, 1.60, 1.50, 1.57, 1.65, 1.88, 1…
## $ weight                   <dbl> NA, NA, 84.37, 55.79, 46.72, 67.13, 131.54, 7…
## $ helmet_12m               <chr> "never", "never", "never", "never", "did not …
## $ text_while_driving_30d   <chr> "0", NA, "30", "0", "did not drive", "did not…
## $ physically_active_7d     <int> 4, 2, 7, 0, 2, 1, 4, 4, 5, 0, 0, 0, 4, 7, 7, …
## $ hours_tv_per_school_day  <chr> "5+", "5+", "5+", "2", "3", "5+", "5+", "5+",…
## $ strength_training_7d     <int> 0, 0, 0, 0, 1, 0, 2, 0, 3, 0, 3, 0, 0, 7, 7, …
## $ school_night_hours_sleep <chr> "8", "6", "<5", "6", "9", "8", "9", "6", "<5"…
#Each case in this dataset represents a single student, the entire set being comprised of 13,583 cases.

Exercise 2

yrbss %>% 
  skim()
Data summary
Name Piped data
Number of rows 13583
Number of columns 13
_______________________
Column type frequency:
character 8
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
gender 12 1.00 4 6 0 2 0
grade 79 0.99 1 5 0 5 0
hispanic 231 0.98 3 8 0 2 0
race 2805 0.79 5 41 0 5 0
helmet_12m 311 0.98 5 12 0 6 0
text_while_driving_30d 918 0.93 1 13 0 8 0
hours_tv_per_school_day 338 0.98 1 12 0 7 0
school_night_hours_sleep 1248 0.91 1 3 0 7 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 77 0.99 16.16 1.26 12.00 15.00 16.00 17.00 18.00 ▁▂▅▅▇
height 1004 0.93 1.69 0.10 1.27 1.60 1.68 1.78 2.11 ▁▅▇▃▁
weight 1004 0.93 67.91 16.90 29.94 56.25 64.41 76.20 180.99 ▆▇▂▁▁
physically_active_7d 273 0.98 3.90 2.56 0.00 2.00 4.00 7.00 7.00 ▆▂▅▃▇
strength_training_7d 1176 0.91 2.95 2.58 0.00 0.00 3.00 5.00 7.00 ▇▂▅▂▅
#We are missing weights from 1,219 observations.
yrbss1 <- yrbss %>% 
  mutate(physical_3plus = if_else(physically_active_7d > 2, "yes", "no"))|>
    filter(!is.na(weight) & !is.na(physical_3plus))

Exercise 3

ggplot(data = yrbss1, aes (physical_3plus, weight)) + geom_violin(position = "dodge")

#Yes, there is a relationship between the variables "physical_3plus" and "weight". I expected there to be a greater difference between the plots of the factors since one's physical activity greatly impacts their weight, but there are countless other factors that do so too, so the results are not too shocking.
yrbss1 %>%
  group_by(physical_3plus) %>%
  summarise(mean_weight = mean(weight, na.rm = TRUE))
## # A tibble: 2 × 2
##   physical_3plus mean_weight
##   <chr>                <dbl>
## 1 no                    66.7
## 2 yes                   68.4

Exercise 4

yrbss1 %>%
  group_by(physical_3plus) %>%
  summarize(n = n())
## # A tibble: 2 × 2
##   physical_3plus     n
##   <chr>          <int>
## 1 no              4022
## 2 yes             8342
yrbss1 %>%
  group_by(weight) %>%
  summarize(n = n())
## # A tibble: 239 × 2
##    weight     n
##     <dbl> <int>
##  1   29.9     1
##  2   31.8     1
##  3   33.1     1
##  4   34.0     3
##  5   34.9     1
##  6   35.4     3
##  7   35.8     1
##  8   36.3     7
##  9   36.7     1
## 10   37.2     1
## # ℹ 229 more rows
#The conditions necessary for inference are satisfied: there are more than 10 success and failures for "physical_3plus" and more than 30 numerical entries for "weight".

Exercise 5

#Ho: There is no difference in the average weights of adolescents who exercise at least 3 times a week and those who do not. Ho : mu1 = mu2
# Ha: There is a difference in the average weights of adolescents who exercise at least 3 times a week and those who do not. Ha : mu1 =/ mu2
obs_diff <- yrbss1 %>%
  specify(weight ~ physical_3plus) %>%
  calculate(stat = "diff in means", order = c("yes", "no"))
null_dist <- yrbss1 %>%
  specify(weight ~ physical_3plus) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "diff in means", order = c("yes", "no"))
ggplot(data = null_dist, aes(x = stat)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Exercise 6

ggplot(data = null_dist, aes(x = stat)) +
   geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

geom_vline(xintercept = 1.774584, color="red")
## mapping: xintercept = ~xintercept 
## geom_vline: na.rm = FALSE
## stat_identity: na.rm = FALSE
## position_identity
#The line does not fall on the distribution.

Exercise 7

#None of the "null_dist" permutations have a difference equal to or larger than the "obs_diff" value.
null_dist %>%
  get_p_value(obs_stat = obs_diff, direction = "two_sided")
## Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
## based on the number of `reps` chosen in the `generate()` step.
## ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.
## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1       0

Exercise 8

# I got a warning message to be cautious about reporting the p-value that the system procured as it is "0". I believe I got this warning because such a p-value is interpreted as there being absolutely no way that the null hypothesis can fail to be rejected, and such broad statements hardly hold true.

Exercise 9

percentile_ci <- null_dist |>
  get_confidence_interval(level = 0.95)
percentile_ci
## # A tibble: 1 × 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1   -0.611    0.625
#Given the inclusion of 0 in the confidence interval, we are 95% sure that there is no statistically significant difference between the average weights of adolescents who exercise at least 3 times a week and those who do not.
LS0tDQp0aXRsZTogIkxhYiAxOiBJbnRybyB0byBSIg0KYXV0aG9yOiAiQXV0aG9yIE5hbWUiDQpkYXRlOiAiYHIgU3lzLkRhdGUoKWAiDQpvdXRwdXQ6IG9wZW5pbnRybzo6bGFiX3JlcG9ydA0KLS0tDQoNCmBgYHtyIGxvYWQtcGFja2FnZXMsIG1lc3NhZ2U9RkFMU0V9DQpsaWJyYXJ5KHRpZHl2ZXJzZSkNCmxpYnJhcnkob3BlbmludHJvKQ0KbGlicmFyeShpbmZlcikNCmxpYnJhcnkoc2tpbXIpDQpgYGANCg0KYGBge3J9DQpkYXRhKCJ5cmJzcyIpDQpgYGANCg0KDQojIyMgRXhlcmNpc2UgMQ0KDQpgYGB7cn0NCmdsaW1wc2UoeXJic3MpDQojRWFjaCBjYXNlIGluIHRoaXMgZGF0YXNldCByZXByZXNlbnRzIGEgc2luZ2xlIHN0dWRlbnQsIHRoZSBlbnRpcmUgc2V0IGJlaW5nIGNvbXByaXNlZCBvZiAxMyw1ODMgY2FzZXMuDQpgYGANCg0KDQojIyMgRXhlcmNpc2UgMg0KDQpgYGB7cn0NCnlyYnNzICU+JSANCiAgc2tpbSgpDQojV2UgYXJlIG1pc3Npbmcgd2VpZ2h0cyBmcm9tIDEsMjE5IG9ic2VydmF0aW9ucy4NCmBgYA0KDQoNCmBgYHtyIHRyZW5kLWdpcmxzfQ0KeXJic3MxIDwtIHlyYnNzICU+JSANCiAgbXV0YXRlKHBoeXNpY2FsXzNwbHVzID0gaWZfZWxzZShwaHlzaWNhbGx5X2FjdGl2ZV83ZCA+IDIsICJ5ZXMiLCAibm8iKSl8Pg0KICAgIGZpbHRlcighaXMubmEod2VpZ2h0KSAmICFpcy5uYShwaHlzaWNhbF8zcGx1cykpDQpgYGANCg0KDQojIyMgRXhlcmNpc2UgMw0KDQpgYGB7cn0NCmdncGxvdChkYXRhID0geXJic3MxLCBhZXMgKHBoeXNpY2FsXzNwbHVzLCB3ZWlnaHQpKSArIGdlb21fdmlvbGluKHBvc2l0aW9uID0gImRvZGdlIikNCmBgYA0KDQpgYGB7cn0NCiNZZXMsIHRoZXJlIGlzIGEgcmVsYXRpb25zaGlwIGJldHdlZW4gdGhlIHZhcmlhYmxlcyAicGh5c2ljYWxfM3BsdXMiIGFuZCAid2VpZ2h0Ii4gSSBleHBlY3RlZCB0aGVyZSB0byBiZSBhIGdyZWF0ZXIgZGlmZmVyZW5jZSBiZXR3ZWVuIHRoZSBwbG90cyBvZiB0aGUgZmFjdG9ycyBzaW5jZSBvbmUncyBwaHlzaWNhbCBhY3Rpdml0eSBncmVhdGx5IGltcGFjdHMgdGhlaXIgd2VpZ2h0LCBidXQgdGhlcmUgYXJlIGNvdW50bGVzcyBvdGhlciBmYWN0b3JzIHRoYXQgZG8gc28gdG9vLCBzbyB0aGUgcmVzdWx0cyBhcmUgbm90IHRvbyBzaG9ja2luZy4NCmBgYA0KDQpgYGB7cn0NCnlyYnNzMSAlPiUNCiAgZ3JvdXBfYnkocGh5c2ljYWxfM3BsdXMpICU+JQ0KICBzdW1tYXJpc2UobWVhbl93ZWlnaHQgPSBtZWFuKHdlaWdodCwgbmEucm0gPSBUUlVFKSkNCmBgYA0KDQoNCg0KIyMjIEV4ZXJjaXNlIDQNCg0KDQpgYGB7cn0NCnlyYnNzMSAlPiUNCiAgZ3JvdXBfYnkocGh5c2ljYWxfM3BsdXMpICU+JQ0KICBzdW1tYXJpemUobiA9IG4oKSkNCmBgYA0KDQpgYGB7cn0NCnlyYnNzMSAlPiUNCiAgZ3JvdXBfYnkod2VpZ2h0KSAlPiUNCiAgc3VtbWFyaXplKG4gPSBuKCkpDQpgYGANCg0KYGBge3J9DQojVGhlIGNvbmRpdGlvbnMgbmVjZXNzYXJ5IGZvciBpbmZlcmVuY2UgYXJlIHNhdGlzZmllZDogdGhlcmUgYXJlIG1vcmUgdGhhbiAxMCBzdWNjZXNzIGFuZCBmYWlsdXJlcyBmb3IgInBoeXNpY2FsXzNwbHVzIiBhbmQgbW9yZSB0aGFuIDMwIG51bWVyaWNhbCBlbnRyaWVzIGZvciAid2VpZ2h0Ii4NCmBgYA0KDQoNCiMjIyBFeGVyY2lzZSA1DQoNCg0KYGBge3J9DQojSG86IFRoZXJlIGlzIG5vIGRpZmZlcmVuY2UgaW4gdGhlIGF2ZXJhZ2Ugd2VpZ2h0cyBvZiBhZG9sZXNjZW50cyB3aG8gZXhlcmNpc2UgYXQgbGVhc3QgMyB0aW1lcyBhIHdlZWsgYW5kIHRob3NlIHdobyBkbyBub3QuIEhvIDogbXUxID0gbXUyDQojIEhhOiBUaGVyZSBpcyBhIGRpZmZlcmVuY2UgaW4gdGhlIGF2ZXJhZ2Ugd2VpZ2h0cyBvZiBhZG9sZXNjZW50cyB3aG8gZXhlcmNpc2UgYXQgbGVhc3QgMyB0aW1lcyBhIHdlZWsgYW5kIHRob3NlIHdobyBkbyBub3QuIEhhIDogbXUxID0vIG11Mg0KYGBgDQoNCmBgYHtyfQ0Kb2JzX2RpZmYgPC0geXJic3MxICU+JQ0KICBzcGVjaWZ5KHdlaWdodCB+IHBoeXNpY2FsXzNwbHVzKSAlPiUNCiAgY2FsY3VsYXRlKHN0YXQgPSAiZGlmZiBpbiBtZWFucyIsIG9yZGVyID0gYygieWVzIiwgIm5vIikpDQpgYGANCg0KYGBge3J9DQpudWxsX2Rpc3QgPC0geXJic3MxICU+JQ0KICBzcGVjaWZ5KHdlaWdodCB+IHBoeXNpY2FsXzNwbHVzKSAlPiUNCiAgaHlwb3RoZXNpemUobnVsbCA9ICJpbmRlcGVuZGVuY2UiKSAlPiUNCiAgZ2VuZXJhdGUocmVwcyA9IDEwMDAsIHR5cGUgPSAicGVybXV0ZSIpICU+JQ0KICBjYWxjdWxhdGUoc3RhdCA9ICJkaWZmIGluIG1lYW5zIiwgb3JkZXIgPSBjKCJ5ZXMiLCAibm8iKSkNCmBgYA0KDQpgYGB7cn0NCmdncGxvdChkYXRhID0gbnVsbF9kaXN0LCBhZXMoeCA9IHN0YXQpKSArDQogIGdlb21faGlzdG9ncmFtKCkNCmBgYA0KDQojIyMgRXhlcmNpc2UgNg0KDQpgYGB7cn0NCmdncGxvdChkYXRhID0gbnVsbF9kaXN0LCBhZXMoeCA9IHN0YXQpKSArDQogICBnZW9tX2hpc3RvZ3JhbSgpDQpnZW9tX3ZsaW5lKHhpbnRlcmNlcHQgPSAxLjc3NDU4NCwgY29sb3I9InJlZCIpDQojVGhlIGxpbmUgZG9lcyBub3QgZmFsbCBvbiB0aGUgZGlzdHJpYnV0aW9uLg0KYGBgDQoNCg0KIyMjIEV4ZXJjaXNlIDcNCg0KDQpgYGB7cn0NCiNOb25lIG9mIHRoZSAibnVsbF9kaXN0IiBwZXJtdXRhdGlvbnMgaGF2ZSBhIGRpZmZlcmVuY2UgZXF1YWwgdG8gb3IgbGFyZ2VyIHRoYW4gdGhlICJvYnNfZGlmZiIgdmFsdWUuDQpgYGANCg0KYGBge3J9DQpudWxsX2Rpc3QgJT4lDQogIGdldF9wX3ZhbHVlKG9ic19zdGF0ID0gb2JzX2RpZmYsIGRpcmVjdGlvbiA9ICJ0d29fc2lkZWQiKQ0KYGBgDQoNCg0KIyMjIEV4ZXJjaXNlIDgNCg0KYGBge3J9DQojIEkgZ290IGEgd2FybmluZyBtZXNzYWdlIHRvIGJlIGNhdXRpb3VzIGFib3V0IHJlcG9ydGluZyB0aGUgcC12YWx1ZSB0aGF0IHRoZSBzeXN0ZW0gcHJvY3VyZWQgYXMgaXQgaXMgIjAiLiBJIGJlbGlldmUgSSBnb3QgdGhpcyB3YXJuaW5nIGJlY2F1c2Ugc3VjaCBhIHAtdmFsdWUgaXMgaW50ZXJwcmV0ZWQgYXMgdGhlcmUgYmVpbmcgYWJzb2x1dGVseSBubyB3YXkgdGhhdCB0aGUgbnVsbCBoeXBvdGhlc2lzIGNhbiBmYWlsIHRvIGJlIHJlamVjdGVkLCBhbmQgc3VjaCBicm9hZCBzdGF0ZW1lbnRzIGhhcmRseSBob2xkIHRydWUuDQpgYGANCg0KDQojIyMgRXhlcmNpc2UgOQ0KDQpgYGB7cn0NCnBlcmNlbnRpbGVfY2kgPC0gbnVsbF9kaXN0IHw+DQogIGdldF9jb25maWRlbmNlX2ludGVydmFsKGxldmVsID0gMC45NSkNCnBlcmNlbnRpbGVfY2kNCmBgYA0KDQpgYGB7cn0NCiNHaXZlbiB0aGUgaW5jbHVzaW9uIG9mIDAgaW4gdGhlIGNvbmZpZGVuY2UgaW50ZXJ2YWwsIHdlIGFyZSA5NSUgc3VyZSB0aGF0IHRoZXJlIGlzIG5vIHN0YXRpc3RpY2FsbHkgc2lnbmlmaWNhbnQgZGlmZmVyZW5jZSBiZXR3ZWVuIHRoZSBhdmVyYWdlIHdlaWdodHMgb2YgYWRvbGVzY2VudHMgd2hvIGV4ZXJjaXNlIGF0IGxlYXN0IDMgdGltZXMgYSB3ZWVrIGFuZCB0aG9zZSB3aG8gZG8gbm90Lg0KYGBgDQoNCg==