Sampling basics
Sampling and point estimates
- Population
- The complete dataset
- Typically don’t know what the whole population is
- Population parameter - A calculation made on the
population dataset
- Sample
- The subset of data you calculate on
- Point estimate (sample statistic)
- A calculation made on the sample dataset
# Sampling and point estimates
pts_vs_flavor_pop <- coffee_ratings %>%
select(total_cup_points, flavor)
dim(pts_vs_flavor_pop)
## [1] 1339 2
pts_vs_flavor_samp <- coffee_ratings %>%
select(total_cup_points, flavor) %>%
slice_sample(n = 10) # slice_sample() for data frames
dim(pts_vs_flavor_samp)
## [1] 10 2
cup_points_samp <- sample(coffee_ratings$total_cup_points, size = 10) # sample() for vectors
cup_points_samp
## [1] 75.58 81.50 79.00 83.50 86.42 87.25 84.08 81.58 81.58 84.67
# Population parameter
mean(pts_vs_flavor_pop$total_cup_points)
## [1] 82.08985
pts_vs_flavor_pop %>%
summarise(mean_flavor = mean(flavor))
# Point estimate
mean(cup_points_samp)
## [1] 82.516
pts_vs_flavor_samp %>%
summarise(mean_flavor = mean(flavor))
# Simple sampling with dplyr
spotify_sample <- spotify_population %>%
slice_sample(n = 1000)
mean_dur_pop <- spotify_population %>% summarize(mean_dur = mean(duration_minutes))
mean_dur_samp <- spotify_sample %>% summarize(mean_dur = mean(duration_minutes))
mean_dur_pop
mean_dur_samp
# Simple sampling with base-R
loudness_pop <- spotify_population$loudness
loudness_samp <- sample(loudness_pop, size = 100)
sd_loudness_pop <- sd(loudness_pop)
sd_loudness_samp <- sd(loudness_samp)
sd_loudness_pop
## [1] 4.524076
sd_loudness_samp
## [1] 4.190513
Convenience sampling
- Convenience sampling
- Sample not representative of population, causing sample bias
- Collecting data by the easiest method is called convenience
sampling
# Convenience sampling coffee ratings
coffee_ratings %>%
summarize(mean_cup_points = mean(total_cup_points))
coffee_ratings_first10 <- coffee_ratings %>%
slice_head(n = 10)
coffee_ratings_first10 %>%
summarize(mean_cup_points = mean(total_cup_points))
# Visualizing selection bias
g1 <- coffee_ratings %>%
ggplot(aes(x = total_cup_points)) +
geom_histogram(binwidth = 2) +
labs(title = "Population")
g2 <- coffee_ratings_first10 %>%
ggplot(aes(x = total_cup_points)) +
geom_histogram(binwidth = 2) +
xlim(59, 91) +
labs(title = "Convenience sampling")
g3 <- coffee_ratings %>%
slice_sample(n = 10) %>%
ggplot(aes(x = total_cup_points)) +
geom_histogram(binwidth = 2) +
xlim(59, 91) +
labs(title = "Random sampling")
grid.arrange(g1, g2, g3, nrow = 1)

Random number generation
- True random numbers
- Generated from physical processes, like flipping coins
- Hotbits uses radioactive decay
- RANDOM.ORG uses atmospheric noise
- Available in R via the random package
- True randomness is slow and expensive
- Pseudo-random number generation
- Next “random” number calculated from previous “random” number
- The first “random” number calculated from a seed
- If you start from the same seed value, all future random numbers
will be the same
- Random number generating functions
- Generating pseudo-random numbers from a set of values
- Functions generate random numbers that follow a statistical
distribution
- Name beginning with “r”
- The first argument is the number of numbers to generate
- Other arguments are distribution-specific
| rbeta |
Beta |
rgeom |
Geometric |
rsignrank |
Wilcoxon signed rank |
| rbinom |
Binomial |
rhyper |
Hypergeometric |
rt |
t |
| rcauchy |
Cauchy |
rlnorm |
Lognormal |
runif |
Uniform |
| rchisq |
Chi-squared |
rlogis |
Logistic |
rweibull |
Weibull |
| rexp |
Exponential |
rnbinom |
Negative binomial |
rwilcox |
Wilcoxon rank sum |
| rf |
F |
rnorm |
Normal |
|
|
| rgamma |
Gamma |
rpois |
Poisson |
|
|
# Visualizing random numbers
randoms <- data.frame(beta = rbeta(5000, shape1 = 2, shape2 = 2))
ggplot(randoms, aes(beta)) +
geom_histogram(binwidth = 0.1)

# Random numbers seeds
set.seed(20000229)
rnorm(5)
## [1] -1.6538464 -0.4028384 -0.1654293 -0.0733821 0.5171339
rnorm(5)
## [1] 1.9078109 0.3791112 -1.4986993 1.6252191 0.6933334
set.seed(20000229)
rnorm(5)
## [1] -1.6538464 -0.4028384 -0.1654293 -0.0733821 0.5171339
rnorm(5)
## [1] 1.9078109 0.3791112 -1.4986993 1.6252191 0.6933334
set.seed(20041004)
rnorm(5)
## [1] -0.65474142 -0.78542098 -0.01516189 0.15143890 0.52847192
rnorm(5)
## [1] 0.7475302 0.9743212 0.1738200 -0.7809440 -0.9304514
args(rnorm)
## function (n, mean = 0, sd = 1)
## NULL
# Generating random numbers
n_numbers = 5000
randoms <- data.frame(
uniform = runif(n_numbers, min = -3, max = 3),
normal = rnorm(n_numbers, mean = 5, sd = 2)
)
gunif <- ggplot(randoms, aes(uniform)) +
geom_histogram(binwidth = 0.25)
gnorm <- ggplot(randoms, aes(normal)) +
geom_histogram(binwidth = 0.5)
grid.arrange(gunif, gnorm, nrow = 1)

Random sampling method
Simple random sampling
# Simple random sampling
set.seed(19000113)
coffee_ratings %>% slice_sample(n = 5)
set.seed(1)
attrition_samp <- attrition_pop %>%
rowid_to_column() %>%
slice_sample(n = 200)
attrition_samp
Systematic sampling
- Systematic random sampling - Samples the population
at regular intervals
- Only safe if a plot has no pattern in the scatter plot. That is, it
just looks like noise
- Shuffling rows + systematic sampling is the same as simple random
sampling
# Systematic random sampling
# Adding a row ID column
coffee_ratings_rowid <- coffee_ratings %>%
rowid_to_column()
# Determining how big the interval should be between each row to include in the sample
sample_size <- 5
pop_size <- nrow(coffee_ratings_rowid)
interval <- pop_size %/% sample_size
# Filtering for every two hundred and sixty seventh row
row_indexes <- seq_len(sample_size) * interval
coffee_ratings_rowid %>% slice(row_indexes)
g1 <- coffee_ratings_rowid %>%
ggplot(aes(x = rowid, y = aftertaste)) +
geom_point() +
geom_smooth() +
ylim(6, 9)
# Making systematic sampling safe
# Randomize the row order before sampling
shuffled <- coffee_ratings_rowid %>%
slice_sample(prop = 1) %>%
select(-rowid) %>%
rowid_to_column()
g2 <- shuffled %>%
ggplot(aes(x = rowid, y = aftertaste)) +
geom_point() +
geom_smooth() +
ylim(6, 9)
grid.arrange(g1, g2, nrow = 1)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Removed 1 rows containing missing values (`geom_point()`).

sample_size <- 200
pop_size <- nrow(attrition_pop)
interval <- pop_size %/% sample_size
row_indexes <- seq_len(sample_size) * interval
attrition_sys_samp <- attrition_pop %>%
rowid_to_column() %>%
slice(row_indexes)
attrition_sys_samp
attrition_pop_id <- attrition_pop %>%
rowid_to_column()
g1 <- ggplot(attrition_pop_id, aes(rowid, YearsAtCompany)) +
geom_point() +
geom_smooth()
attrition_shuffled <- attrition_pop %>%
slice_sample(prop = 1) %>%
rowid_to_column()
g2 <- ggplot(attrition_shuffled, aes(rowid, YearsAtCompany)) +
geom_point() +
geom_smooth()
grid.arrange(g1, g2, nrow = 1)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Stratified sampling
- Stratified sampling - A technique that allows to
sample a population that contains subgroups
# Stratified sampling
# Coffees by country
top_counts <- coffee_ratings %>%
count(country_of_origin, sort = TRUE) %>%
head()
top_counts
# Filtering for 6 countries
top_counted_countries <- c("Mexico", "Colombia", "Guatemala", "Brazil", "Taiwan", "United States (Hawaii)")
coffee_ratings_top <- coffee_ratings %>%
filter(country_of_origin %in% top_counted_countries)
coffee_ratings_top <- coffee_ratings %>%
semi_join(top_counts)
## Joining with `by = join_by(country_of_origin)`
# Counts of a simple random sample
coffee_ratings_samp <- coffee_ratings_top %>%
slice_sample(prop = 0.1)
# Population
coffee_ratings_top %>%
count(country_of_origin, sort = TRUE) %>%
mutate(percent = 100 * n / sum(n))
# 10% sample - Disproportion
coffee_ratings_samp %>%
count(country_of_origin, sort = TRUE) %>%
mutate(percent = 100 * n / sum(n))
# Proportional stratified sampling
coffee_ratings_strat <- coffee_ratings_top %>%
group_by(country_of_origin) %>%
slice_sample(prop = 0.1) %>%
ungroup()
coffee_ratings_strat %>%
count(country_of_origin, sort = TRUE) %>%
mutate(percent = 100 * n / sum(n))
# Equal counts stratified sampling
coffee_ratings_eq <- coffee_ratings_top %>%
group_by(country_of_origin) %>%
slice_sample(n = 15) %>%
ungroup()
coffee_ratings_eq %>%
count(country_of_origin, sort = TRUE) %>%
mutate(percent = 100 * n / sum(n))
# Weighted random sampling
coffee_ratings_weight <- coffee_ratings_top %>%
mutate(weight = ifelse(country_of_origin == "Taiwan", 2, 1)) %>%
slice_sample(prop = 0.1, weight_by = weight)
coffee_ratings_weight %>%
count(country_of_origin, sort = TRUE) %>%
mutate(percent = 100 * n / sum(n))
# Stratified sampling
# Proportional stratified sampling
attrition_pop %>%
count(Education, sort = TRUE) %>%
mutate(percent = 100 * n / sum(n))
attrition_strat <- attrition_pop %>%
group_by(Education) %>%
slice_sample(prop = 0.4) %>%
ungroup()
education_counts_strat <- attrition_strat %>%
count(Education, sort = TRUE) %>%
mutate(percent = 100 * n / sum(n))
education_counts_strat
# Equal counts stratified sampling
attrition_eq <- attrition_pop %>%
group_by(Education) %>%
slice_sample(n = 30) %>%
ungroup()
education_counts_eq <- attrition_eq %>%
count(Education, sort = TRUE) %>%
mutate(percent = 100 * n / sum(n))
education_counts_eq
# Weighted sampling
g1 <- ggplot(attrition_pop, aes(YearsAtCompany)) +
geom_histogram(binwidth = 1)
# Sample 400 employees weighted by YearsAtCompany
attrition_weight <- slice_sample(attrition_pop, n = 400, weight_by = YearsAtCompany)
g2 <- ggplot(attrition_weight, aes(YearsAtCompany)) +
geom_histogram(binwidth = 1)
grid.arrange(g1, g2, nrow = 1)

mean(attrition_pop$YearsAtCompany)
## [1] 7.008163
mean(attrition_weight$YearsAtCompany) # The weighted simple random sample is biased
## [1] 10.76
Cluster sampling
- Stratified vs Cluster sampling
- Stratified sampling
- Split the population into subgroups
- Use simple random sampling on every subgroup
- Need to collect data from every subgroup, can be expensive
- Cheaper alternative - Cluster sampling
- Cluster sampling
- Use simple random sampling to pick some subgroups
- Use simple random sampling on only those subgroups
- Multistage sampling
- Cluster sampling is a type of multistage sampling
- You can have > 2 stages
- Countrywide surveys may sample states, counties, cities, and
neighborhoods
# Varieties of coffee
varieties_pop <- unique(coffee_ratings$variety)
# Stage 1: sampling for subgroups
varieties_samp <- sample(varieties_pop, size = 3)
# Stage 2: sampling each group
coffee_ratings %>%
filter(variety %in% varieties_samp) %>%
group_by(variety) %>%
slice_sample(n = 5) %>%
ungroup()
job_roles_pop <- unique(attrition_pop$JobRole)
job_roles_samp <- sample(job_roles_pop, size = 4)
attrition_filtered <- filter(attrition_pop, JobRole %in% job_roles_samp)
attrition_clus <- attrition_filtered %>%
group_by(JobRole) %>%
slice_sample(n = 10)
attrition_clus
Comparing sampling methods
# Comparing sampling methods
# Review of sampling techniques
# Setup
top_counted_countries <- c("Mexico", "Colombia", "Guatemala", "Brazil", "Taiwan", "United States (Hawaii)")
coffee_ratings_top <- coffee_ratings %>%
filter(country_of_origin %in% top_counted_countries)
# Simple random sampling
coffee_ratings_srs <- coffee_ratings_top %>%
slice_sample(prop = 1/3)
# Stratified sampling
coffee_ratings_strat <- coffee_ratings_top %>%
group_by(country_of_origin) %>%
slice_sample(prop = 1/3) %>%
ungroup()
# Cluster sampling
top_countries_samp <- sample(top_counted_countries, size = 2)
coffee_ratings_clust <- coffee_ratings_top %>%
filter(country_of_origin %in% top_countries_samp) %>%
group_by(country_of_origin) %>%
slice_sample(n = nrow(coffee_ratings_top) %/% 6) %>%
ungroup()
# Calculating mean cup points
# The simple and stratified sample means are really close to the population mean
# Cluster sampling isn't quite as close, but that's typical
# Cluster sampling is designed to give you an answer that's almost as good, while using less data
# Population mean
coffee_ratings_top %>%
summarize(mean_points = mean(total_cup_points))
# Point estimates of the mean - sample mean
# Simple random sample
coffee_ratings_srs %>%
summarize(mean_points = mean(total_cup_points))
# Stratified sample
coffee_ratings_strat %>%
summarize(mean_points = mean(total_cup_points))
# Cluster sample
coffee_ratings_clust %>%
summarize(mean_points = mean(total_cup_points))
# Mean cup points by country
# Each of the sample means is pretty close to the population mean
# The same is true of the sample means from the stratified technique
# With cluster sampling, while the sample means are pretty close to the population mean, the obvious limitation is that you only get values for the two countries that were included in the sample
# If the mean cup points for each country is an important metric in your analysis, cluster sampling would be a bad idea
# Population
coffee_ratings_top %>%
group_by(country_of_origin) %>%
summarize(mean_points = mean(total_cup_points))
# Simple random sample
coffee_ratings_srs %>%
group_by(country_of_origin) %>%
summarize(mean_points = mean(total_cup_points))
# Stratified sample
coffee_ratings_strat %>%
group_by(country_of_origin) %>%
summarize(mean_points = mean(total_cup_points))
# Cluster sample
coffee_ratings_clust %>%
group_by(country_of_origin) %>%
summarize(mean_points = mean(total_cup_points))
attrition_srs <- attrition_pop %>%
slice_sample(prop = 1/4)
attrition_strat <- attrition_pop %>%
group_by(RelationshipSatisfaction) %>%
slice_sample(prop = 1/4) %>%
ungroup
satisfaction_unique <- unique(attrition_pop$RelationshipSatisfaction)
satisfaction_samp <- sample(satisfaction_unique, size = 2)
attrition_clust <- attrition_pop %>%
filter(RelationshipSatisfaction %in% satisfaction_samp) %>%
group_by(RelationshipSatisfaction) %>%
slice_sample(n = nrow(attrition_pop) %/% 4) %>%
ungroup
# Population
mean_attrition_pop <- attrition_pop %>%
group_by(RelationshipSatisfaction) %>%
summarise(mean_attrition = mean(Attrition == "Yes"))
mean_attrition_pop
# Simple random sample
mean_attrition_srs <- attrition_srs %>%
group_by(RelationshipSatisfaction) %>%
summarise(mean_attrition = mean(Attrition == "Yes"))
mean_attrition_srs
# Stratified sample
mean_attrition_strat <- attrition_strat %>%
group_by(RelationshipSatisfaction) %>%
summarise(mean_attrition = mean(Attrition == "Yes"))
mean_attrition_strat
# Cluster sample
mean_attrition_clust <- attrition_clust %>%
group_by(RelationshipSatisfaction) %>%
summarise(mean_attrition = mean(Attrition == "Yes"))
mean_attrition_clust
Sampling distribution
Relative error of point estimates
- Sample size - The number of observations, that is,
the number of rows, in the sample
- In general, larger sample sizes will give us more accurate results.
The relative error decreases as the sample size increases
# Relative error vs. sample size
relative_errors <- function(sample_size){
# Population parameter
population_mean <- coffee_ratings %>%
summarize(mean_points = mean(total_cup_points)) %>%
pull(mean_points)
# Point estimate
sample_mean <- coffee_ratings %>%
slice_sample(n = sample_size) %>%
summarize(mean_points = mean(total_cup_points)) %>%
pull(mean_points)
# Relative error as a percentage
relative_error = 100 * abs(population_mean - sample_mean) / population_mean
return(relative_error)
}
sample_size = c(1:1500)
errors <- data.frame(sample_size = sample_size, relative_error = sapply(sample_size, relative_errors))
# The black line is really noisy, particularly with small sample sizes
# The downward slope in the trend line is quite steep to begin with. Further to the right in the plot, the slope is less steep
# At the far right of the plot, where the sample size is the whole population, the error decreases to zero
ggplot(errors, aes(sample_size, relative_error)) +
geom_line() +
geom_smooth(method = "loess")
## `geom_smooth()` using formula = 'y ~ x'

# Calculating relative errors
mean_attrition_pop <- mean(attrition_pop$Attrition == "Yes")
attrition_srs10 <- slice_sample(attrition_pop, n = 10)
mean_attrition_srs10 <- mean(attrition_srs10$Attrition == "Yes")
rel_error_pct10 <- 100 * abs(mean_attrition_pop - mean_attrition_srs10) / mean_attrition_pop
rel_error_pct10
## [1] 24.05063
attrition_srs100 <- slice_sample(attrition_pop, n = 100)
mean_attrition_srs100 <- mean(attrition_srs100$Attrition == "Yes")
rel_error_pct100 <- 100 * abs(mean_attrition_pop - mean_attrition_srs100) / mean_attrition_pop
rel_error_pct100
## [1] 30.25316
Sampling distribution
- Sampling distribution - A distribution of several
replicates of point estimates
- When you calculate a point estimate such as a sample mean, the value
you calculate depends on the rows that were included in the sample. That
means that there is some randomness in the answer. In order to quantify
the variation caused by this randomness, you can create many samples and
calculate the sample mean (or other statistic) for each sample
- Decreasing the sample size, the range of the results is broader
- Increasing the sample size results in a narrower range
- Increasing the number of replicates does not affect the relative
error of the sample means, it does result in a more consistent shape to
the distribution
# Sampling distribution
mean_cup_points_5 <- replicate(
n = 1000,
expr = coffee_ratings %>%
slice_sample(n = 5) %>%
summarize(mean_cup_points = mean(total_cup_points)) %>%
pull(mean_cup_points)
)
sample_means_5 <- tibble(sample_mean = mean_cup_points_5)
mean_cup_points_30 <- replicate(
n = 1000,
expr = coffee_ratings %>%
slice_sample(n = 30) %>%
summarize(mean_cup_points = mean(total_cup_points)) %>%
pull(mean_cup_points)
)
sample_means_30 <- tibble(sample_mean = mean_cup_points_30)
mean_cup_points_150 <- replicate(
n = 1000,
expr = coffee_ratings %>%
slice_sample(n = 150) %>%
summarize(mean_cup_points = mean(total_cup_points)) %>%
pull(mean_cup_points)
)
sample_means_150 <- tibble(sample_mean = mean_cup_points_150)
# Distribution of sample means
g1 <- ggplot(sample_means_5, aes(sample_mean)) +
geom_histogram(binwidth = 0.1) +
labs(title = "Sample Size 5")
g2 <- ggplot(sample_means_30, aes(sample_mean)) +
geom_histogram(binwidth = 0.1) +
labs(title = "Sample Size 30")
g3 <- ggplot(sample_means_150, aes(sample_mean)) +
geom_histogram(binwidth = 0.1) +
labs(title = "Sample Size 150")
grid.arrange(g1, g2, g3, nrow = 1)

# Replicating samples
mean_attritions_rep_100 <- replicate(
n = 100,
attrition_pop %>%
slice_sample(n = 20) %>%
summarize(mean_attrition = mean(Attrition == "Yes")) %>%
pull(mean_attrition)
)
sample_means_rep_100 <- tibble(sample_mean = mean_attritions_rep_100)
mean_attritions_rep_1000 <- replicate(
n = 1000,
attrition_pop %>%
slice_sample(n = 20) %>%
summarize(mean_attrition = mean(Attrition == "Yes")) %>%
pull(mean_attrition)
)
sample_means_rep_1000 <- tibble(sample_mean = mean_attritions_rep_1000)
g1 <- ggplot(sample_means_rep_100, aes(sample_mean)) +
geom_histogram(binwidth = 0.05)
g2 <- ggplot(sample_means_rep_1000, aes(sample_mean)) +
geom_histogram(binwidth = 0.05)
grid.arrange(g1, g2, nrow = 1)

Approximate vs. Actual sampling distributions
- Exact sampling distribution - Calculate exactly, rather than using
an approximation
- Sampling distribution - The distribution of a sample statistic
# Exact sampling distribution
dice <- expand_grid(
die1 = 1:6,
die2 = 1:6,
die3 = 1:6,
die4 = 1:6
) %>%
mutate(mean_roll = (die1 + die2 + die3 + die4) / 4)
g1 <- ggplot(dice, aes(factor(mean_roll))) +
geom_bar()
# Approximate sampling distribution
# The number of outcomes increases fast
# Need to rely on approximations
sample_means_1000 <- replicate(
n = 1000,
expr = {
four_rolls <- sample(1:6, size = 4, replace = TRUE)
mean(four_rolls)
}
)
sample_means <- tibble(sample_mean = sample_means_1000)
g2 <- ggplot(sample_means, aes(factor(sample_mean))) +
geom_bar()
grid.arrange(g1, g2, nrow = 1)

# Exact sampling distribution
dice <- expand_grid(
die1 = 1:8,
die2 = 1:8,
die3 = 1:8,
die4 = 1:8,
die5 = 1:8
) %>%
mutate(mean_roll = (die1 + die2 + die3 + die4 + die5) / 5)
g1 <- ggplot(dice, aes(factor(mean_roll))) +
geom_bar()
# Approximate sampling distribution
sample_means_1000 <- replicate(
n = 1000,
expr = {
five_rolls <- sample(1:8, size = 5, replace = TRUE)
mean(five_rolls)
}
)
sample_means <- tibble(sample_mean = sample_means_1000)
g2 <- ggplot(sample_means, aes(factor(sample_mean))) +
geom_bar()
grid.arrange(g1, g2, nrow = 1)

Consequences of the central limit theorem
- Averages of independent samples have approximately normal
distributions
- As the sample size increases,
- The distribution of the averages gets closer to being normally
distributed
- The width of the sampling distribution gets narrower
- One other consequence of the central limit theorem is that if you
divide the population mean by the square root of the sample size, you
get an estimate of the standard deviation of the sampling
distribution
# Sampling distribution
mean_cup_points_5 <- replicate(
n = 5000,
expr = coffee_ratings %>%
slice_sample(n = 5) %>%
summarize(mean_cup_points = mean(total_cup_points)) %>%
pull(mean_cup_points)
)
sample_means_5 <- tibble(sample_mean = mean_cup_points_5)
mean_cup_points_20 <- replicate(
n = 5000,
expr = coffee_ratings %>%
slice_sample(n = 20) %>%
summarize(mean_cup_points = mean(total_cup_points)) %>%
pull(mean_cup_points)
)
sample_means_20 <- tibble(sample_mean = mean_cup_points_20)
mean_cup_points_80 <- replicate(
n = 5000,
expr = coffee_ratings %>%
slice_sample(n = 80) %>%
summarize(mean_cup_points = mean(total_cup_points)) %>%
pull(mean_cup_points)
)
sample_means_80 <- tibble(sample_mean = mean_cup_points_80)
mean_cup_points_320 <- replicate(
n = 5000,
expr = coffee_ratings %>%
slice_sample(n = 320) %>%
summarize(mean_cup_points = mean(total_cup_points)) %>%
pull(mean_cup_points)
)
sample_means_320 <- tibble(sample_mean = mean_cup_points_320)
# Distribution of sample means
g1 <- ggplot(sample_means_5, aes(sample_mean)) +
geom_histogram() +
labs(title = "Sample Size 5")
g2 <- ggplot(sample_means_20, aes(sample_mean)) +
geom_histogram() +
labs(title = "Sample Size 20")
g3 <- ggplot(sample_means_80, aes(sample_mean)) +
geom_histogram() +
labs(title = "Sample Size 80")
g4 <- ggplot(sample_means_320, aes(sample_mean)) +
geom_histogram() +
labs(title = "Sample Size 320")
grid.arrange(g1, g2, g3, g4, nrow = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Population & sampling distribution means
data.frame(
sample_size = c("Population", 5, 20, 80, 320),
mean_sample_mean = c(
mean(coffee_ratings$total_cup_points),
mean(mean_cup_points_5),
mean(mean_cup_points_20),
mean(mean_cup_points_80),
mean(mean_cup_points_320)
)
)
# Population & sampling distribution standard deviations
data.frame(
sample_size = c("Population", 5, 20, 80, 320),
sd_sample_mean = c(
sd(coffee_ratings$total_cup_points),
sd(mean_cup_points_5),
sd(mean_cup_points_20),
sd(mean_cup_points_80),
sd(mean_cup_points_320)
)
)
# Population & sampling distribution means
mean_attritions_5 <- replicate(
n = 1000,
attrition_pop %>%
slice_sample(n = 5) %>%
summarise(mean_attrition = mean(Attrition == "Yes")) %>%
pull(mean_attrition)
)
sampling_distribution_5 <- tibble(
replicate = 1:1000,
mean_attrition = mean_attritions_5
)
mean_attritions_50 <- replicate(
n = 1000,
attrition_pop %>%
slice_sample(n = 50) %>%
summarise(mean_attrition = mean(Attrition == "Yes")) %>%
pull(mean_attrition)
)
sampling_distribution_50 <- tibble(
replicate = 1:1000,
mean_attrition = mean_attritions_50
)
mean_attritions_500 <- replicate(
n = 1000,
attrition_pop %>%
slice_sample(n = 500) %>%
summarise(mean_attrition = mean(Attrition == "Yes")) %>%
pull(mean_attrition)
)
sampling_distribution_500 <- tibble(
replicate = 1:1000,
mean_attrition = mean_attritions_500
)
data.frame(
sample_size = c("Population", 5, 50, 500),
mean_mean_attrition = c(
mean(attrition_pop$Attrition == "Yes"),
mean(sampling_distribution_5$mean_attrition),
mean(sampling_distribution_50$mean_attrition),
mean(sampling_distribution_500$mean_attrition)
)
)
# Population and sampling distribution variation
data.frame(
sample_size = c("Population", 5, 50, 500),
mean_mean_attrition = c(
sd(attrition_pop$Attrition == "Yes"),
sd(sampling_distribution_5$mean_attrition),
sd(sampling_distribution_50$mean_attrition),
sd(sampling_distribution_500$mean_attrition)
)
)
Bootstrapping
Bootstrapping
- Sampling without replacement
- Sampling with replacement - Resampling
- Why sample with replacement?
- Think of the coffee_ratings data as being a sample of a larger
population of all coffees
- Think about each coffee in our sample as being representative of
many different coffees that we don’t have in our sample, but do exist in
the population
- Sampling with replacement is a proxy for including different members
of these groups in our sample
- Bootstrapping - Building up a theoretical
population from your sample
- The opposite of sampling from a population
- Sampling - Going from a population to a smaller
sample
- Bootstrapping use case
- Develop understanding of sampling variability using a single sample
- Which is important in cases where you aren’t able to sample multiple
times from a population to create a sampling distribution as you’ve
seen
- Bootstrapping process
- Make a resample of the same size as the original sample
- Calculate the statistic of interest for this bootstrap sample
- Repeat steps 1 and 2 many times
- The resulting statistics are called bootstrap
statistics and when viewed to see their variability a bootstrap
distribution
# Bootstrapping
# Coffee data preparation
coffee_focus <- coffee_ratings %>%
select(variety, country_of_origin, flavor) %>%
rowid_to_column()
glimpse(coffee_focus)
## Rows: 1,339
## Columns: 4
## $ rowid <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
## $ variety <chr> NA, "Other", "Bourbon", NA, "Other", NA, "Other", NA…
## $ country_of_origin <chr> "Ethiopia", "Ethiopia", "Guatemala", "Ethiopia", "Et…
## $ flavor <dbl> 8.83, 8.67, 8.50, 8.58, 8.50, 8.42, 8.50, 8.33, 8.67…
# Resampling with slice_sample()
coffee_resamp <- coffee_focus %>%
slice_sample(prop = 1, replace = TRUE)
# Repeated coffees
coffee_resamp %>% count(rowid, sort = TRUE)
# Missing coffees
coffee_resamp %>%
summarize(
coffees_included = n_distinct(rowid),
coffees_not_included = n() - coffees_included
)
# Bootstrapping coffee mean flavor
# Step 3. Repeat many times
mean_flavors_1000 <- replicate(
n = 1000,
expr = {
coffee_focus %>%
# Step 1. Resample
slice_sample(prop = 1, replace = TRUE) %>%
# Step 2. Calculate statistic
summarize(mean_flavor = mean(flavor, na.rm = TRUE)) %>%
pull(mean_flavor)
}
)
# Bootstrap distribution histogram
bootstrap_distn <- tibble(resample_mean = mean_flavors_1000)
ggplot(bootstrap_distn, aes(resample_mean)) +
geom_histogram(binwidth = 0.0025)

Comparing sampling and bootstrap distributions
- Sample, bootstrap distribution, population means
- Sample mean and bootstrap distribution mean values are really close.
However, the true population mean is actually slightly different
- The bootstrap distribution mean is usually almost identical to the
sample mean
- It may not be a good estimate of the population mean if the original
sample wasn’t closely representative of the population
- Bootstrapping cannot correct biases due to differences between your
sample and the population
- Sample, bootstrap distribution, population standard deviation
- Standard error is the standard deviation of the statistic of
interest
- Standard deviation describes variability within a single sample
- Standard error describes variability across multiple samples of a
population
- Bootstrap standard deviation = Standard error of mean (SEM) in
certain situations
- Quantifying what variability you might expect in your sample
statistic as you go from one sample to another
- Estimated standard error is the standard deviation of the bootstrap
distribution for a sample statistic
- The bootstrap distribution standard error times the square root of
the sample size estimates the standard deviation in the population
- Although bootstrapping was poor at estimating the population mean,
it is, in general, great for estimating the population standard
deviation
- The sampling distribution mean is the best estimate of the true
population mean; the bootstrap distribution mean is closest to the
original sample mean
# Comparing sampling and bootstrap distributions
# Coffee focused subset
set.seed(19790801)
coffee_sample <- coffee_ratings %>%
select(variety, country_of_origin, flavor) %>%
rowid_to_column() %>%
slice_sample(n = 500)
glimpse(coffee_sample)
## Rows: 500
## Columns: 4
## $ rowid <int> 10, 278, 458, 622, 131, 385, 1292, 47, 904, 1020, 55…
## $ variety <chr> "Other", "Bourbon", NA, "Caturra", "Caturra", "Yello…
## $ country_of_origin <chr> "Ethiopia", "Guatemala", "Colombia", "Thailand", "Co…
## $ flavor <dbl> 8.58, 7.75, 7.75, 7.50, 8.00, 7.83, 7.17, 8.08, 7.33…
# The bootstrap of mean coffee flavors
mean_flavors_1000 <- replicate(
n = 1000,
expr = coffee_sample %>%
slice_sample(prop = 1, replace = TRUE) %>%
summarize(mean_flavor = mean(flavor, na.rm = TRUE)) %>%
pull(mean_flavor)
)
bootstrap_distn <- tibble(resample_mean = mean_flavors_1000)
# Mean flavor bootstrap distribution
ggplot(bootstrap_distn, aes(resample_mean)) +
geom_histogram(binwidth = 0.0025)

# Sample, bootstrap distribution, population means
data.frame(
sample_mean = mean(coffee_sample$flavor),
# In the bootstrap distribution, each value is an estimate of the mean flavor score
# Each of these values corresponds to one potential sample mean from the theoretical population
# If we take the mean of those means, we get our best guess of the population mean
bootstrap_mean = mean(bootstrap_distn$resample_mean), # Estimated population mean
population_mean = mean(coffee_ratings$flavor)
)
# Sample, bootstrap distribution, population standard deviation
data.frame(
sample_sd = sd(coffee_sample$flavor),
bootstrap_sd = sd(bootstrap_distn$resample_mean),
population_sd = sd(coffee_ratings$flavor),
estimated_population_sd = sd(bootstrap_distn$resample_mean) * sqrt(500)
)
Confidence intervals
- “Values within one standard deviation of the mean” includes a large
number of values from each of these distributions
- We’ll define a related concept called a confidence
interval
- E.g. Predicting the weather
- Point estimate = 47 °F (8.3 °C)
- 40 to 54 °F is a confidence interval
- Sometimes written as
47 °F (40 °F, 54 °F) or
47 °F [40 °F, 54 °F] or 47 ± 7 °F
- 7 °F is the margin of error
- Method
- Mean plus or minus one standard deviation
- Quantile method
- Standard error method
- Inverse cumulative distribution function
- PDF: The bell curve
- CDF: Integrate to get area under bell curve
- Inv. CDF: Flip x and y axes
# Confidence intervals
mean_flavors_1000 <- replicate(
n = 1000,
expr = coffee_sample %>%
slice_sample(prop = 1, replace = TRUE) %>%
summarize(mean_flavor = mean(flavor, na.rm = TRUE)) %>%
pull(mean_flavor)
)
coffee_boot_distn <- tibble(resample_mean = mean_flavors_1000)
# Mean of the resamples
coffee_boot_distn %>%
summarize(
mean_resample_mean = mean(resample_mean),
mean_minus_1sd = mean_resample_mean - sd(resample_mean),
mean_plus_1sd = mean_resample_mean + sd(resample_mean),
lower = quantile(resample_mean, 0.025),
upper = quantile(resample_mean, 0.975)
)
ggplot(coffee_boot_distn, aes(resample_mean)) +
geom_histogram(binwidth = 0.002) +
# Mean
geom_vline(aes(xintercept = mean(resample_mean)), color = "blue") +
geom_text(aes(mean(resample_mean), 0, label = round(mean(resample_mean), 2)), vjust=1) +
# Mean plus or minus one standard deviation
# Confidence interval of one standard deviation doesn't cover a lot of the distribution
geom_vline(aes(xintercept = mean(resample_mean) - sd(resample_mean)), color = "green") +
geom_text(aes(mean(mean(resample_mean) - sd(resample_mean)), 0, label = round(mean(resample_mean) - sd(resample_mean), 2)), vjust=1) +
geom_vline(aes(xintercept = mean(resample_mean) + sd(resample_mean)), color = "green") +
geom_text(aes(mean(resample_mean) + sd(resample_mean), 0, label = round(mean(resample_mean) + sd(resample_mean), 2)), vjust=1) +
# Quantile method for confidence intervals
geom_vline(aes(xintercept = quantile(resample_mean, 0.025)), color = "red") +
geom_text(aes(quantile(resample_mean, 0.025), 0, label = round(quantile(resample_mean, 0.025), 2)), vjust=1) +
geom_vline(aes(xintercept = quantile(resample_mean, 0.975)), color = "red") +
geom_text(aes(quantile(resample_mean, 0.975), 0, label = round(quantile(resample_mean, 0.975), 2)), vjust=1)

# Inverse cumulative distribution function
normal_inv_cdf <- tibble(p = seq(-0.001, 0.999, 0.001), inv_cdf = qnorm(p))
## Warning in qnorm(p): NaNs produced
ggplot(normal_inv_cdf, aes(p, inv_cdf)) +
geom_line() +
geom_point(aes(0.025, qnorm(0.025)), colour="red") +
geom_text(aes(0.125, qnorm(0.025), label = paste0("(", 0.025, ", ", round(qnorm(0.025), 2), ")"))) +
geom_point(aes(0.975, qnorm(0.975)), colour="red") +
geom_text(aes(0.875, qnorm(0.975), label = paste0("(", 0.975, ", ", round(qnorm(0.975), 2), ")")))
## Warning: Removed 1 row containing missing values (`geom_line()`).

# Standard error method for confidence interval
coffee_boot_distn %>%
summarize(point_estimate = mean(resample_mean),
std_error = sd(resample_mean),
lower = qnorm(0.025, point_estimate, std_error),
upper = qnorm(0.975, point_estimate, std_error)
)
The most important things
- The standard deviation of the sampling distribution (a.k.a. the
standard error) of a statistic is well-approximated by the standard
deviation of the bootstrap distribution of a statistic
- When calculating confidence intervals, it’s OK to assume that
bootstrap distributions are approximately normally distributed