library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
library(dplyr)
library(infer)
library(metan)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## |=========================================================|
## | Multi-Environment Trial Analysis (metan) v1.17.0 |
## | Author: Tiago Olivoto |
## | Type 'citation('metan')' to know how to cite metan |
## | Type 'vignette('metan_start')' for a short tutorial |
## | Visit 'https://bit.ly/pkgmetan' for a complete tutorial |
## |=========================================================|
##
## Attaching package: 'metan'
##
## The following object is masked from 'package:forcats':
##
## as_factor
##
## The following object is masked from 'package:dplyr':
##
## recode_factor
##
## The following object is masked from 'package:tidyr':
##
## replace_na
##
## The following objects are masked from 'package:tibble':
##
## column_to_rownames, remove_rownames, rownames_to_column
#Plot CI function source from https://rdrr.io/github/OpenIntroOrg/oilabs-r-package/src/R/plot_ci.R
# Instead of loading and implementing OpenIntro Labs companion library
plot_ci <- function(lo, hi, m) {
par(mar=c(2, 1, 1, 1), mgp=c(2.7, 0.7, 0))
k <- length(lo)
ci.max <- max(rowSums(matrix(c(-1*lo,hi),ncol=2)))
xR <- m + ci.max*c(-1, 1)
yR <- c(0, 41*k/40)
plot(xR, yR, type='n', xlab='', ylab='', axes=FALSE)
abline(v=m, lty=2, col='#00000088')
axis(1, at=m, paste("mu = ",round(m,4)), cex.axis=1.15)
#axis(2)
for(i in 1:k){
x <- mean(c(hi[i],lo[i]))
ci <- c(lo[i],hi[i])
if((m < hi[i] & m > lo[i])==FALSE){
col <- "#F05133"
points(x, i, cex=1.4, col=col)
# points(x, i, pch=20, cex=1.2, col=col)
lines(ci, rep(i, 2), col=col, lwd=5)
} else{
col <- 1
points(x, i, pch=20, cex=1.2, col=col)
lines(ci, rep(i, 2), col=col)
}
}
}
set.seed(0144)
pop <- tibble(
climate_change_affects = c(rep("Yes", 62000), rep("No", 38000))
)
samp1 <- pop %>%
sample_n(60)
ggplot(samp1, aes(x = climate_change_affects)) +
geom_bar() +
labs(
x = "", y = "",
title = "Do you think climate change is affecting your local community?"
) +
coord_flip()
pop %>%
count(climate_change_affects) %>%
mutate(p = n /sum(n)) %>% t()
## [,1] [,2]
## climate_change_affects "No" "Yes"
## n "38000" "62000"
## p "0.38" "0.62"
In this sample, 61.67% of adults believe that climate change affects their community.
I would expect another student’s sample size to be similar to mine, but not exactly the same. These are random samples, so there should be some degree of variation in the data set, but it should be remarkably close.
We used the phrase 95% confident to indicate that we are 95% sure that the value falls within a range.
My confidence interval has a 95% chance to contain the true population portion of UUS adults who think climate change affects their local community. And if I was working in a lab, my neighbors interval would have the same 95% chance to capture it. (assuming they followed the same code as above)
I would expect 95% of the ci intervals to capture the true population mean. Because all the intervals have a 95% ci, I would expect 95% of all of them to capture the true population mean.
n <- 50
upper_ci <- rep(NA, n)
lower_ci <- rep(NA, n)
for(i in 1:n){
test <- sample_n(pop, 60)
ci <- test %>%
specify(response = climate_change_affects, success = "Yes") %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "prop") %>%
get_ci(level = 0.95)
upper_ci[i] <- ci$upper_ci
lower_ci[i] <- ci$lower_ci
}
plot_ci(lower_ci, upper_ci, 0.62)
96% of my sample (48/50) contains the true population proportion. This proportion is approximately equal to the Ci, within a margin of error.
I choose a 1% ci! This would have a significantly smaller ci than the 95% as there only needs to be a 1% chance that the value is within the range.
n <- 50
upper_ci <- rep(NA, n)
lower_ci <- rep(NA, n)
for(i in 1:n){
test <- sample_n(pop, 60)
ci <- test %>%
specify(response = climate_change_affects, success = "Yes") %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "prop") %>%
get_ci(level = 0.01)
upper_ci[i] <- ci$upper_ci
lower_ci[i] <- ci$lower_ci
}
plot_ci(lower_ci, upper_ci, 0.62)
None of the cis include the true population proportion.
n <- 50
upper_ci <- rep(NA, n)
lower_ci <- rep(NA, n)
for(i in 1:n){
test <- sample_n(pop, 60)
ci <- test %>%
specify(response = climate_change_affects, success = "Yes") %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "prop") %>%
get_ci(level = 0.25)
upper_ci[i] <- ci$upper_ci
lower_ci[i] <- ci$lower_ci
}
plot_ci(lower_ci, upper_ci, 0.62)
12/50 contain the true mean! The spread of the intervals is a lot upper_cier, but less of them contain the true mean. As the number of samples decrease, the variation of the intervals decreases
n <- 50
upper_ci <- rep(NA, n)
lower_ci <- rep(NA, n)
for(i in 1:n){
test <- sample_n(pop, 60)
ci <- test %>%
specify(response = climate_change_affects, success = "Yes") %>%
generate(reps = 2000, type = "bootstrap") %>%
calculate(stat = "prop") %>%
get_ci(level = 0.01)
upper_ci[i] <- ci$upper_ci
lower_ci[i] <- ci$lower_ci
}
plot_ci(lower_ci, upper_ci, 0.62)
Since the seed has been set the number of reps do not matter.