This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.2
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.2 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.2
## Warning: package 'stringr' was built under R version 4.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(openintro)
## Warning: package 'openintro' was built under R version 4.1.2
## Loading required package: airports
## Warning: package 'airports' was built under R version 4.1.2
## Loading required package: cherryblossom
## Warning: package 'cherryblossom' was built under R version 4.1.2
## Loading required package: usdata
## Warning: package 'usdata' was built under R version 4.1.2
library(ggplot2)
library(infer)
## Warning: package 'infer' was built under R version 4.1.2
yrbss
## # A tibble: 13,583 x 13
## age gender grade hispanic race height weight helmet_12m text_while_driv~
## <int> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 14 female 9 not Black ~ NA NA never 0
## 2 14 female 9 not Black ~ NA NA never <NA>
## 3 15 female 9 hispanic Native~ 1.73 84.4 never 30
## 4 15 female 9 not Black ~ 1.6 55.8 never 0
## 5 15 female 9 not Black ~ 1.5 46.7 did not r~ did not drive
## 6 15 female 9 not Black ~ 1.57 67.1 did not r~ did not drive
## 7 15 female 9 not Black ~ 1.65 132. did not r~ <NA>
## 8 14 male 9 not Black ~ 1.88 71.2 never <NA>
## 9 15 male 9 not Black ~ 1.75 63.5 never <NA>
## 10 15 male 10 not Black ~ 1.37 97.1 did not r~ <NA>
## # ... with 13,573 more rows, and 4 more variables: physically_active_7d <int>,
## # hours_tv_per_school_day <chr>, strength_training_7d <int>,
## # school_night_hours_sleep <chr>
count_yrbss <- count(yrbss)
count_yrbss
## # A tibble: 1 x 1
## n
## <int>
## 1 13583
count_each <-yrbss%>%
count(text_while_driving_30d)
count_each
## # A tibble: 9 x 2
## text_while_driving_30d n
## <chr> <int>
## 1 0 4792
## 2 1-2 925
## 3 10-19 373
## 4 20-29 298
## 5 3-5 493
## 6 30 827
## 7 6-9 311
## 8 did not drive 4646
## 9 <NA> 918
no_helmet <- yrbss %>%
filter(helmet_12m == "never")
no_helmet <- no_helmet %>%
mutate(text_ind = ifelse(text_while_driving_30d == "30", "yes", "no"))
no_helmet%>%
count(text_ind)
## # A tibble: 3 x 2
## text_ind n
## <chr> <int>
## 1 no 6040
## 2 yes 463
## 3 <NA> 474
no_helmet %>%
specify(response = text_ind, success = "yes") %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "prop") %>%
get_ci(level = 0.95)
## Warning: Removed 474 rows containing missing values.
## Warning: You have given `type = "bootstrap"`, but `type` is expected to be
## `"draw"`. This workflow is untested and the results may not mean what you think
## they mean.
## # A tibble: 1 x 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 0.0650 0.0774
n = 6977
z = 1.96
p <- seq(from = 0, to = 1, by = 0.01)
me <- 2 * sqrt(p * (1 - p)/n)
me = 0.004
#calculate confidence intervals for two other categorical variables
good_sleep <- yrbss%>%
mutate(slept_well = ifelse(school_night_hours_sleep > 5, "yes", "no"))
good_sleep%>%
specify(response = slept_well, success = "yes") %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "prop") %>%
get_ci(level = 0.95)
## Warning: Removed 1248 rows containing missing values.
## Warning: You have given `type = "bootstrap"`, but `type` is expected to be
## `"draw"`. This workflow is untested and the results may not mean what you think
## they mean.
## # A tibble: 1 x 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 0.769 0.783
no_tv <- yrbss %>%
mutate(did_not_watch_tv = ifelse(hours_tv_per_school_day == "do not watch", "yes", "no") )
no_tv %>%
specify(response = did_not_watch_tv, success = "yes") %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "prop") %>%
get_ci(level = 0.95)
## Warning: Removed 338 rows containing missing values.
## Warning: You have given `type = "bootstrap"`, but `type` is expected to be
## `"draw"`. This workflow is untested and the results may not mean what you think
## they mean.
## # A tibble: 1 x 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 0.133 0.145
n <- 1000
p <- seq(from = 0, to = 1, by = 0.01)
me <- 2 * sqrt(p * (1 - p)/n)
dd <- data.frame(p = p, me = me)
ggplot(data = dd, aes(x = p, y = me)) +
geom_line() +
labs(x = "Population Proportion", y = "Margin of Error")
# Exercise 5:
Describe the relationship between p and me. Include the margin of error vs. population proportion plot you constructed in your answer. For a given sample size, for which value of p is margin of error maximized?
# The margin of error increases as the population proportion increases. And after 50% it begins to decrease.
# The distribution is normal. The center is at 0.1 and the spread is between 0.04 and 0.17
p <- 0.1
n <- 300
n*p
## [1] 30
n*(1-p)
## [1] 270
#Keep n constant and change p. How does the shape, center, and spread of the sampling distribution vary as p changes. #You might want to adjust min and max for the x-axis for a better view of the distribution.
# The distribution seems to be normal. The center is at 0.1 and the spread is between 0.05 and 0.16
p <- 0.5
n <- 300
n*p
## [1] 150
n*(1-p)
## [1] 150
#Now also change n. How does n appear to affect the distribution of p^?
# When n decrease the spread increase
p <- 0.5
n <- 200
n*p
## [1] 100
n*(1-p)
## [1] 100
p <- 0.5
n <- 200
n*p
## [1] 100
n*(1-p)
## [1] 100
Null Hypothesis: There is no difference in strength training between students that sleep more than 10+ hours and those who don’t.
Alternative: There is a difference in strength training between students that sleep more than 10+ hours and those who don’t.
There is a 95% confident that the student proportion of those students that sleep more than 10+ hours are between 0.221 and 0.321.
good_sleep <- yrbss %>%
filter(school_night_hours_sleep == "10+")
good_sleep <- good_sleep %>%
mutate(strength = ifelse(strength_training_7d == "7", "yes", "no"))
good_sleep %>%
specify(response = strength, success = "yes") %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "prop") %>%
get_ci(level = 0.95)
## Warning: Removed 4 rows containing missing values.
## Warning: You have given `type = "bootstrap"`, but `type` is expected to be
## `"draw"`. This workflow is untested and the results may not mean what you think
## they mean.
## # A tibble: 1 x 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 0.221 0.317
# There would be a 5% chance of detecting a change. A type 1 error is a false positive
# ME = 1.96 * SE=1.96 * sqrtp(1−p)/n
# n = (0.3)2/(0.01/1.96)2
# n= 3457