library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
library(infer)
# Exploratory pass over yrbss: tabulate three survey variables, then build the
# yes/no indicator data frames that the confidence-interval steps consume.

# Frequency of each answer to the texting-while-driving question.
count_each <- yrbss %>%
  count(text_while_driving_30d)
count_each

# Frequency tables for the two other variables of interest. Each table is named
# after the column it counts, so it cannot be confused with (or clobbered by)
# the indicator data frames built below.
hours_tv_counts <- yrbss %>%
  count(hours_tv_per_school_day)
physically_active_counts <- yrbss %>%
  count(physically_active_7d)

yrbss

# "yes" when physically_active_7d equals 7, "no" otherwise, NA when missing.
# The indicator column keeps the name `text_ind` because the filter()/specify()
# steps that use this data frame refer to it by that name.
physically_active_7d <- yrbss %>%
  mutate(text_ind = ifelse(physically_active_7d == "7", "yes", "no"))

# "yes" when the TV answer is "do not watch", "no" otherwise, NA when missing.
hours_tv_per_sd <- yrbss %>%
  mutate(
    text_ind = ifelse(hours_tv_per_school_day == "do not watch", "yes", "no")
  )

physically_active_7d
hours_tv_per_sd
# Among students who answered "never" for helmet_12m, estimate the proportion
# who answered "30" for text_while_driving_30d, with a 95% bootstrap interval.
data("yrbss", package = "openintro")

# Restrict to the never-helmet group and flag the "30" answers in one pass.
texting_helmet <- yrbss %>%
  filter(helmet_12m == "never") %>%
  mutate(text_ind = ifelse(text_while_driving_30d == "30", "yes", "no"))

texting_helmet %>%
  count(text_ind)
texting_helmet

# ifelse() yields NA where the texting answer is missing, so drop those rows
# explicitly. generate() defaults to reps = 1, which would make the percentile
# interval a single point; request 1000 bootstrap replicates instead.
texting_helmet %>%
  filter(!is.na(text_ind)) %>%
  specify(response = text_ind, success = "yes") %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "prop") %>%
  get_ci(level = 0.95)
# Margin of error for a few small proportions at a fixed n.
n <- 6977 # number of observations used as the denominator below
z <- 1.96 # z-score; not used below, where the multiplier is rounded to 2
p <- seq(0, 0.04, by = 0.01)
me <- 2 * sqrt(p * (1 - p) / n)
me
## [1] 0.000000000 0.002382392 0.003352152 0.004084530 0.004692035
The margin of error is approximately 0.004 (about 0.4 percentage points).
# Pair each proportion with its margin of error and draw the relationship
# as a line.
dd <- data.frame(p, me)
dd %>%
  ggplot(aes(x = p, y = me)) +
  geom_line() +
  labs(x = "Population Proportion", y = "Margin of Error")
# 95% bootstrap interval for the proportion of students whose
# physically_active_7d answer is 7.

# (Re)build the yes/no indicator so this step is self-contained.
physically_active_7d <- physically_active_7d %>%
  mutate(text_ind = ifelse(physically_active_7d == "7", "yes", "no"))
physically_active_7d

# Missing answers become NA in text_ind; drop them explicitly rather than
# relying on filter() discarding the NA produced by comparing with "".
physically_active_7d %>%
  filter(!is.na(text_ind)) %>%
  specify(response = text_ind, success = "yes") %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "prop") %>%
  get_ci(level = 0.95)
# 95% bootstrap interval for the proportion of students whose text_ind is
# "yes" in hours_tv_per_sd.
hours_tv_per_sd

# Rows with a missing indicator are removed explicitly; comparing against ""
# only worked because filter() drops rows where the comparison is NA.
hours_tv_per_sd %>%
  filter(!is.na(text_ind)) %>%
  specify(response = text_ind, success = "yes") %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "prop") %>%
  get_ci(level = 0.95)
The 95% CI for the proportion of students who were physically active on all 7 days is (0.2645379, 0.2797145).
The 95% CI for the proportion of students who do not watch TV on school days is (0.1330294, 0.1454134).
# Margin of error over the whole range of proportions, for n = 1000.
n <- 1000
p <- seq(0, 1, by = 0.01)
me <- 2 * sqrt(p * (1 - p) / n)
dd <- data.frame(p, me)
# Line plot of margin of error against population proportion.
dd %>%
  ggplot(aes(x = p, y = me)) +
  geom_line() +
  labs(x = "Population Proportion", y = "Margin of Error")
This chart is an arch shaped like an inverted parabola, symmetric about a population proportion of 0.5.
The margin of error is largest when p = 0.5 (about 0.032 for n = 1000) and falls to 0 as p approaches 0 or 1.
# Sampling distribution of a sample proportion for p = 0.1 and n = 300:
# the standard error, then one standard error either side of the center.
p <- 0.1
n <- 300
# Compute the standard error once instead of repeating the expression.
se <- sqrt(p * (1 - p) / n)
se
## [1] 0.01732051
# Bounds are written in terms of p (not a repeated literal) so they follow p.
p - se
## [1] 0.08267949
p + se
## [1] 0.1173205
The center is 0.1 and the standard error is about 0.017, so one standard error either side of the center spans (0.083, 0.117).
# Standard error and one-standard-error bounds for a sample proportion.
# NOTE(review): p and n are the same values as in the preceding computation,
# so the results are identical; change them here to see how the center and
# spread respond.
p <- 0.1
n <- 300
# Compute the standard error once instead of repeating the expression.
se <- sqrt(p * (1 - p) / n)
se
## [1] 0.01732051
# Bounds are written in terms of p (not a repeated literal) so they follow p.
p - se
## [1] 0.08267949
p + se
## [1] 0.1173205
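As p changes, the center of the sampling distribution moves with it. The spread, sqrt(p(1 - p)/n), also changes: it is widest at p = 0.5 and narrows as p approaches 0 or 1.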
As n changes, the spread of the sampling distribution changes: larger values of n give a tighter spread, and smaller values of n give a wider spread.
Null hypothesis: there is no difference in strength training for students who sleep 10+ hours. Alternative hypothesis: there is a difference in strength training for students who sleep 10+ hours.
# Strength training on all 7 days (strength_training_7d equal to 7): 95%
# bootstrap intervals for all students and for the students who report 10+
# hours of sleep on school nights, so the two groups can be compared.

# Subset the hypothesis is about.
sleep <- yrbss %>%
  filter(school_night_hours_sleep == "10+")

# Indicator for the full sample.
strengthTraining <- yrbss %>%
  mutate(text_ind = ifelse(strength_training_7d == "7", "yes", "no"))

# Interval for all students. Rows with a missing indicator are dropped
# explicitly rather than via a comparison with "".
strengthTraining %>%
  filter(!is.na(text_ind)) %>%
  specify(response = text_ind, success = "yes") %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "prop") %>%
  get_ci(level = 0.95)

# Interval for the 10+ hours group. `sleep` was previously computed but never
# used, so the subgroup named in the hypothesis was never examined.
sleep %>%
  mutate(text_ind = ifelse(strength_training_7d == "7", "yes", "no")) %>%
  filter(!is.na(text_ind)) %>%
  specify(response = text_ind, success = "yes") %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "prop") %>%
  get_ci(level = 0.95)
The 95% bootstrap CI for the proportion of all students who strength train on all 7 days is (0.1616, 0.1743).
A Type 1 error is a false positive: rejecting the null hypothesis when it is actually true. At a 5% significance level, I believe there would be a 5% chance of making a Type 1 error in this setting when the null hypothesis is true.
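The margin of error is $ME = 1.96 \times SE = 1.96\sqrt{p(1-p)/n}$.
So if we are aiming for a margin of error of at most 1% at 95% confidence, using the worst case $p = 0.5$: $n = 1.96^2 \times 0.5 \times (1 - 0.5) / ME^2 = 1.96^2 \times 0.25 / 0.01^2 = 9{,}604$.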