library(tidyverse)
library(openintro)
library(mosaic)
library(ggformula)

Exercise 1

By dropping the cases with missing values, some rows were removed from the data. Without dropping them, later code that summarizes hispanic and strength_training_7d would have to handle missing values at every step, so filtering them out up front keeps the rest of the analysis consistent.

Youth<-yrbss%>%
  filter(!is.na(hispanic) & !is.na(strength_training_7d))
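As a quick check (a sketch using the packages already loaded above), we can count how many rows the filter removed and confirm that tidyr's drop_na() gives the same result:

# How many rows the NA filter removed
nrow(yrbss) - nrow(Youth)

# Equivalent filtering with tidyr's drop_na()
yrbss %>%
  drop_na(hispanic, strength_training_7d) %>%
  nrow()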

Exercise 2

The mean is 2.94 and the standard deviation is 2.57. The data are discrete, since strength_training_7d takes only whole-number values from 0 to 7 with no decimals. The distribution is asymmetric, since the two sides differ, but it is not strongly skewed toward either side; the largest counts sit at the extremes of 0 and 7 days.

table(Youth$hispanic)
## 
## hispanic      not 
##     3093     9103
table(Youth$strength_training_7d)
## 
##    0    1    2    3    4    5    6    7 
## 3583  994 1283 1446 1041 1308  507 2034
favstats(~strength_training_7d,data=Youth)
##  min Q1 median Q3 max     mean       sd     n missing
##    0  0      3  5   7 2.942112 2.574756 12196       0
histogram(Youth$strength_training_7d)
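The same numbers quoted above can be computed in a single dplyr summarise() call; this is just a sketch that mirrors the favstats() and table() output:

# dplyr equivalent of the summaries above
Youth %>%
  summarise(
    mean_strength = mean(strength_training_7d),   # ~2.94
    sd_strength   = sd(strength_training_7d),     # ~2.57
    prop_hispanic = mean(hispanic == "hispanic")  # 3093 / 12196 ~ 0.254
  )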

Exercise 3

The values would not be the same each time, since each run draws a new random sample. Because we sample only 10 random individuals, the statistics can land well above or below the population parameters. The population proportion of Hispanic students is about 25.36%, so sample proportions of 0.1 or 0.4, which samples of size 10 often produce, are not very close to it. The only way the results would be the same on every run is by setting a seed value before sampling.

sample_stats_once <- function(data, n,
                              hisp = hispanic,
                              strength = `strength_training_7d`,
                              replace = FALSE) {
  data %>%
    slice_sample(n = n, replace = replace) %>%     # sampling
    summarise(
      prop_hispanic = mean({{hisp}} == "hispanic", na.rm = TRUE),
      mean_strength = mean({{strength}}, na.rm = TRUE),
      sd_strength   = sd({{strength}}, na.rm = TRUE)
    )
}

sample_stats_once(Youth, n=10)
## # A tibble: 1 × 3
##   prop_hispanic mean_strength sd_strength
##           <dbl>         <dbl>       <dbl>
## 1           0.1           2.4        3.06
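As noted above, fixing the seed before each draw is what makes a single sample reproducible; a minimal sketch (the seed value 123 is arbitrary and only for illustration):

set.seed(123)                    # arbitrary seed, chosen for illustration
sample_stats_once(Youth, n = 10)

set.seed(123)                    # same seed -> identical sample and statistics
sample_stats_once(Youth, n = 10)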

Exercise 4

Each of these runs gave different results, and single samples do not show a clean pattern. The sample proportion of Hispanic students was farthest from the population value of about 0.25 at n = 27 (0.44) and closest at n = 200 (0.29). The sample means of strength training (3.04, 2.90, 3.22, and 2.68 for n = 27, 40, 100, and 200) and the sample standard deviations (2.23, 2.69, 2.77, and 2.53) also bounced around the population values of 2.94 and 2.57 rather than changing steadily with sample size; any single sample can land above or below the parameter.

sample_stats_once(Youth, n=200)
## # A tibble: 1 × 3
##   prop_hispanic mean_strength sd_strength
##           <dbl>         <dbl>       <dbl>
## 1          0.29          2.68        2.53
sample_stats_once(Youth, n=27)
## # A tibble: 1 × 3
##   prop_hispanic mean_strength sd_strength
##           <dbl>         <dbl>       <dbl>
## 1         0.444          3.04        2.23
sample_stats_once(Youth, n=40)
## # A tibble: 1 × 3
##   prop_hispanic mean_strength sd_strength
##           <dbl>         <dbl>       <dbl>
## 1           0.3           2.9        2.69
sample_stats_once(Youth, n=100)
## # A tibble: 1 × 3
##   prop_hispanic mean_strength sd_strength
##           <dbl>         <dbl>       <dbl>
## 1          0.34          3.22        2.77
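The four calls above can also be run in one pipeline; this is a sketch using purrr's map() and tidyr's unnest(), and because no seed is set the numbers will differ from run to run:

# One sample at each size, collected into a single tibble
tibble(n = c(27, 40, 100, 200)) %>%
  mutate(stats = map(n, ~ sample_stats_once(Youth, n = .x))) %>%
  unnest(stats)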

Exercise 5

Both the means and standard deviations from the repeated random samples are basically the same as those from the full data set. Looking at the histogram, though, the distribution is skewed to the right, so it would not be classified as normal.

n <- 10        # sample size you want
reps <- 50000     # number of replications

set.seed(2025) # common seed so we all get the same 50000 samples
results_10 <- map_dfr(
  seq_len(reps),
  ~sample_stats_once(Youth, n),
  .id = "replicate"
) %>%
  mutate(replicate = as.integer(replicate))
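If the right-skew claim refers to the simulated proportions, a quick numeric check (a sketch) is to compare their mean and median; a mean noticeably above the median points to right skew:

# Mean vs. median of the simulated proportions as a rough skew check
results_10 %>%
  summarise(
    mean_prop   = mean(prop_hispanic),
    median_prop = median(prop_hispanic),
    gap         = mean_prop - median_prop   # positive gap suggests right skew
  )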

Exercise 6

Because we drew so many random samples, the mean of the sample proportions (about 0.254) lands very close to the true population proportion of 0.2536. The standard deviation of the sample proportions is also fairly small, about 0.138, and it matches the theoretical standard error computed below.

# Inspect
glimpse(results_10)
## Rows: 50,000
## Columns: 4
## $ replicate     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ prop_hispanic <dbl> 0.3, 0.2, 0.5, 0.4, 0.3, 0.0, 0.3, 0.2, 0.1, 0.5, 0.4, 0…
## $ mean_strength <dbl> 3.9, 1.0, 3.6, 2.6, 2.5, 4.8, 3.0, 3.5, 2.8, 3.0, 3.6, 2…
## $ sd_strength   <dbl> 2.960856, 1.247219, 2.796824, 2.796824, 2.368778, 2.2509…
head(results_10)
## # A tibble: 6 × 4
##   replicate prop_hispanic mean_strength sd_strength
##       <int>         <dbl>         <dbl>       <dbl>
## 1         1           0.3           3.9        2.96
## 2         2           0.2           1          1.25
## 3         3           0.5           3.6        2.80
## 4         4           0.4           2.6        2.80
## 5         5           0.3           2.5        2.37
## 6         6           0             4.8        2.25
favstats(~prop_hispanic,data=results_10)
##  min  Q1 median  Q3 max    mean       sd     n missing
##    0 0.2    0.2 0.3 0.8 0.25365 0.137824 50000       0
gf_histogram(~prop_hispanic,data=results_10,binwidth=0.05)

sqrt(.256*(1-.256)/10)
## [1] 0.1380087
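Rather than typing 0.256 by hand, the simulated and theoretical standard errors can be compared directly from the data; a sketch (p_hat here is just the proportion of Hispanic students in the full Youth data):

# Simulated SE (SD of the 50,000 sample proportions) vs. the theoretical SE
p_hat <- mean(Youth$hispanic == "hispanic")      # ~0.2536
c(simulated   = sd(results_10$prop_hispanic),    # ~0.138
  theoretical = sqrt(p_hat * (1 - p_hat) / 10))  # ~0.138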

Exercise 6

The mean of the sample proportions is essentially the same (about 0.254), but the standard deviation is much lower (about 0.069 versus 0.138) because each sample now has 40 observations instead of 10.

n <- 40        # sample size you want
reps <- 50000     # number of replications

set.seed(2025) # common seed so we all get the same 50000 samples
results_40 <- map_dfr(
  seq_len(reps),
  ~sample_stats_once(Youth, n),
  .id = "replicate"
) %>%
  mutate(replicate = as.integer(replicate))

head(results_40)
## # A tibble: 6 × 4
##   replicate prop_hispanic mean_strength sd_strength
##       <int>         <dbl>         <dbl>       <dbl>
## 1         1          0.35          2.78        2.70
## 2         2          0.2           3.45        2.55
## 3         3          0.3           2.88        2.47
## 4         4          0.2           3.35        2.65
## 5         5          0.1           3.02        2.76
## 6         6          0.25          2.38        2.11
favstats(~prop_hispanic,data=results_40)
##    min  Q1 median  Q3   max     mean        sd     n missing
##  0.025 0.2   0.25 0.3 0.575 0.253553 0.0688607 50000       0
gf_histogram(~prop_hispanic,data=results_40,binwidth=0.05)
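For comparison with the simulated standard deviation of about 0.0689 above, the theoretical standard error with n = 40 can be computed the same way (a sketch):

p_hat <- mean(Youth$hispanic == "hispanic")  # ~0.2536
sqrt(p_hat * (1 - p_hat) / 40)               # ~0.0688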

Exercise 7

The mean is basically the same as the population average days of strength training (2.94). This histogram is much more symmetric than the population histogram and is not noticeably skewed in either direction.

favstats(~mean_strength,data=results_10)
##  min  Q1 median  Q3 max     mean        sd     n missing
##  0.1 2.4    2.9 3.5 6.1 2.941948 0.8152654 50000       0
gf_histogram(~mean_strength,data=results_10)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
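The near-normal shape is what the Central Limit Theorem predicts; as a sketch, the standard deviation of the simulated sample means should be close to the population standard deviation divided by the square root of the sample size:

# CLT check: population SD / sqrt(n) vs. the simulated SD of ~0.815 above
sd(Youth$strength_training_7d) / sqrt(10)   # 2.575 / sqrt(10) ~ 0.814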

Exercise 8

The value for results_10 is 0.04358 while the value for results_40 is 0.01089. The results_40 value is much lower since its sample size of 40 is much larger than 10. The population proportion is represented by the mean of the sample proportions, and the two simulations agree almost exactly: about 25.37% for results_10 and 25.36% for results_40, both matching the population value of 25.36%.

0.137824/sqrt(10)
## [1] 0.04358378
0.0688607/sqrt(40)
## [1] 0.01088783
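Another way to see the effect of sample size (a sketch): because the standard error scales as 1/sqrt(n), quadrupling the sample size from 10 to 40 should roughly halve the spread of the sample proportions, and the two simulated standard deviations show exactly that:

# Ratio of the simulated SDs for n = 10 and n = 40
sd(results_10$prop_hispanic) / sd(results_40$prop_hispanic)  # ~0.138 / 0.069 ~ 2
sqrt(40 / 10)                                                # = 2, the expected ratio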

Exercise 9

The means of the sample standard deviations are very close to the population standard deviation, which suggests the sample standard deviation is roughly unbiased. The means were about 2.54 (n = 10) and 2.57 (n = 40), and the population standard deviation was 2.57.

favstats(results_10$sd_strength)
##        min       Q1   median       Q3      max     mean        sd     n missing
##  0.3162278 2.299758 2.581989 2.830391 3.689324 2.540862 0.4078629 50000       0
favstats(results_40$sd_strength)
##       min       Q1   median       Q3      max     mean       sd     n missing
##  1.721657 2.454588 2.575899 2.690915 3.178352 2.568947 0.176705 50000       0
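To put the comparison in one place, the population standard deviation can be lined up against the average sample standard deviation from each simulation (a sketch):

# Population SD vs. the average of the sample SDs
c(population = sd(Youth$strength_training_7d),   # ~2.57
  mean_n10   = mean(results_10$sd_strength),     # ~2.54
  mean_n40   = mean(results_40$sd_strength))     # ~2.57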