library(tidyverse)
library(openintro)
library(infer)The largest group in this category is 0 and ‘did not drive’.
yrbss %>%
count(text_while_driving_30d) ## # A tibble: 9 x 2
## text_while_driving_30d n
## * <chr> <int>
## 1 0 4792
## 2 1-2 925
## 3 10-19 373
## 4 20-29 298
## 5 3-5 493
## 6 30 827
## 7 6-9 311
## 8 did not drive 4646
## 9 <NA> 918
Around 7.1% of teens texted while driving 30 days straight without wearing a helmet.
no_helmet <- yrbss %>%
filter(helmet_12m == "never")
no_helmet <- no_helmet %>%
mutate(text_ind = ifelse(text_while_driving_30d == "30", "yes", "no"))
no_helmet %>%
drop_na(text_ind) %>%
count(text_ind)## # A tibble: 2 x 2
## text_ind n
## * <chr> <int>
## 1 no 6040
## 2 yes 463
no_helmet %>%
specify(response = text_ind, success = "yes") %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "prop") %>%
get_ci(level = 0.95)## Warning: Removed 474 rows containing missing values.
## # A tibble: 1 x 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 0.0654 0.0778
no_helmet <- no_helmet %>%
drop_na(text_ind)
n <- count(no_helmet)
p <- count(no_helmet %>% filter(text_ind == "yes")) / n
SE <- sqrt( (p*(1-p)) / n )
ME <- 1.96 * SE
print(ME)## n
## 1 0.006250207
The confidence interval we calculates is between 6.1% and 7.8%.This means we are 95% confident the true proportion would fall between this range.
no_drive <- yrbss %>%
filter(helmet_12m == "did not ride")
no_drive <- no_drive %>%
mutate(text_ind = ifelse(text_while_driving_30d == "30", "yes", "no"))
no_drive %>%
specify(response = text_ind, success = "yes") %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "prop") %>%
get_ci(level = 0.95)## Warning: Removed 324 rows containing missing values.
## # A tibble: 1 x 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 0.0627 0.0774
rarely_helmet <- yrbss %>%
filter(helmet_12m == "rarely")
rarely_helmet <- rarely_helmet %>%
mutate(text_ind = ifelse(text_while_driving_30d == "30", "yes", "no"))
rarely_helmet %>%
specify(response = text_ind, success = "yes") %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "prop") %>%
get_ci(level = 0.95)## Warning: Removed 43 rows containing missing values.
## # A tibble: 1 x 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 0.0284 0.0597
As the population proportion approaches 0.5 the Margin of Error increases. Meaning the more even the proportion the higher the Margin of Error is.
n <- 1000
p <- seq(from = 0, to = 1, by = 0.01)
me <- 2 * sqrt(p * (1 - p)/n)
dd <- data.frame(p = p, me = me)
ggplot(data = dd, aes(x = p, y = me)) +
geom_line() +
labs(x = "Population Proportion", y = "Margin of Error")The mean is slightly off from the population proportion p = 0.1 being 0.09. The distribution is still normal and the standard deviation is 0.02.
set.seed(10000)
sample1 <- data.frame(samples = sample(c("y", "n"), 300, replace = TRUE, prob = c(0.1, 0.9))) %>%
generate(reps = 15000, type = "bootstrap") %>%
count(samples) %>%
mutate(p_hat = n /sum(n)) %>%
filter(samples == 'y')
hist(sample1$p_hat)library(psych)##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
describe(sample1$p_hat)## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 15000 0.09 0.02 0.09 0.09 0.01 0.03 0.15 0.12 0.16 0.05 0
###Exercise 7
Increasing the proportion in this case increases the standard deviation. In the case where p = 0.1 the standard deviation of 0.02 and increasing p to 0.3 and 0.5 gave us a value of 0.03.
sample2 <- data.frame(samples = sample(c("y", "n"), 300, replace = TRUE, prob = c(0.3, 0.7))) %>%
generate(reps = 15000, type = "bootstrap") %>%
count(samples) %>%
mutate(p_hat = n /sum(n)) %>%
filter(samples == 'y')
hist(sample2$p_hat)describe(sample2$p_hat)## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 15000 0.28 0.03 0.28 0.28 0.02 0.18 0.39 0.21 0.07 0.05 0
sample3 <- data.frame(samples = sample(c("y", "n"), 300, replace = TRUE, prob = c(0.5, 0.5))) %>%
generate(reps = 15000, type = "bootstrap") %>%
count(samples) %>%
mutate(p_hat = n /sum(n)) %>%
filter(samples == 'y')
hist(sample3$p_hat)describe(sample3$p_hat)## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 15000 0.52 0.03 0.52 0.52 0.03 0.41 0.63 0.22 0.01 0 0
sample4 <- data.frame(samples = sample(c("y", "n"), 2000, replace = TRUE, prob = c(0.1, 0.9))) %>%
generate(reps = 15000, type = "bootstrap") %>%
count(samples) %>%
mutate(p_hat = n /sum(n)) %>%
filter(samples == 'y')
hist(sample4$p_hat)My null hypothesis is the proportions of students that sleep 10+ hours and weight train will match the proportion of those who do not.
school_sleep <- yrbss %>%
mutate(ten_plus_sleep = ifelse(school_night_hours_sleep == "10+", "yes", "no")) %>%
mutate(s_training = ifelse(strength_training_7d == "7", "yes", "no")) %>%
drop_na(school_night_hours_sleep) %>%
drop_na(strength_training_7d)
school_sleep %>%
count(ten_plus_sleep, s_training)## # A tibble: 4 x 3
## ten_plus_sleep s_training n
## <chr> <chr> <int>
## 1 no no 9949
## 2 no yes 1958
## 3 yes no 228
## 4 yes yes 84
p_0 <- 1958 / (1958 + 9949)
p <- 84 / (228 + 84)
n <- 228 + 84
SE <- sqrt( (p_0*(1-p_0)) / n )
Z <- (p - p_0) / SE
1 - pnorm(Z)## [1] 2.965257e-07
With this result I reject the null hypothesis and accept the alternative hypothesis that students that sleep 10+ hours are more likely to weight train.
In this case the chace this is a coincidence is 3*10^-7% chance. Very unlikely.
Assuming we want at the very least a 1% margin of error I am assuming our proportion is 0.5 since that would produce the largest ME. 1,900 would be the minimum amount of people we would need to survery in order to reach that value.
(0.05*(1-0.05)) / ((0.01/2)^2)## [1] 1900