R Notebook 05 - Sampling in R

# Load the packages
suppressMessages(suppressWarnings(library(tidyverse)))
suppressMessages(suppressWarnings(library(gridExtra)))
suppressMessages(suppressWarnings(library(fst)))
suppressMessages(suppressWarnings(library(tibble)))

# Read in the data files
coffee_ratings <- read.fst("coffee_ratings_full.fst")
spotify_population <- read.fst("spotify_2000_2020.fst")
attrition_pop <- read.fst("attrition.fst")

Sampling basics

Sampling and point estimates

Population
- The complete dataset
- Typically don’t know what the whole population is
- Population parameter - A calculation made on the population dataset
Sample
- The subset of data you calculate on
- Point estimate (sample statistic) - A calculation made on the sample dataset

# Sampling and point estimates
pts_vs_flavor_pop <- coffee_ratings %>% 
  select(total_cup_points, flavor)
dim(pts_vs_flavor_pop)

## [1] 1339    2

pts_vs_flavor_samp <- coffee_ratings %>% 
  select(total_cup_points, flavor) %>% 
  slice_sample(n = 10) # slice_sample() for data frames
dim(pts_vs_flavor_samp)

## [1] 10  2

cup_points_samp <- sample(coffee_ratings$total_cup_points, size = 10) # sample() for vectors
cup_points_samp

##  [1] 75.58 81.50 79.00 83.50 86.42 87.25 84.08 81.58 81.58 84.67

# Population parameter
mean(pts_vs_flavor_pop$total_cup_points)

## [1] 82.08985

pts_vs_flavor_pop %>% 
  summarise(mean_flavor = mean(flavor))

# Point estimate
mean(cup_points_samp)

## [1] 82.516

pts_vs_flavor_samp %>% 
  summarise(mean_flavor = mean(flavor))

# Simple sampling with dplyr
spotify_sample <- spotify_population %>% 
  slice_sample(n = 1000)

mean_dur_pop <- spotify_population %>% summarize(mean_dur = mean(duration_minutes))
mean_dur_samp <- spotify_sample %>% summarize(mean_dur = mean(duration_minutes))

mean_dur_pop

mean_dur_samp

# Simple sampling with base-R
loudness_pop <- spotify_population$loudness
loudness_samp <- sample(loudness_pop, size = 100)

sd_loudness_pop <- sd(loudness_pop)
sd_loudness_samp <- sd(loudness_samp)

sd_loudness_pop

## [1] 4.524076

sd_loudness_samp

## [1] 4.190513

Convenience sampling

Convenience sampling
- Sample not representative of population, causing sample bias
- Collecting data by the easiest method is called convenience sampling

# Convenience sampling coffee ratings
coffee_ratings %>% 
  summarize(mean_cup_points = mean(total_cup_points))

coffee_ratings_first10 <- coffee_ratings %>% 
  slice_head(n = 10)

coffee_ratings_first10 %>% 
  summarize(mean_cup_points = mean(total_cup_points))

# Visualizing selection bias
g1 <- coffee_ratings %>% 
  ggplot(aes(x = total_cup_points)) +
    geom_histogram(binwidth = 2) +
    labs(title = "Population")

g2 <- coffee_ratings_first10 %>% 
  ggplot(aes(x = total_cup_points)) +
    geom_histogram(binwidth = 2) +
    xlim(59, 91) +
    labs(title = "Convenience sampling")

g3 <- coffee_ratings %>% 
  slice_sample(n = 10) %>% 
  ggplot(aes(x = total_cup_points)) +
    geom_histogram(binwidth = 2) +
    xlim(59, 91) +
    labs(title = "Random sampling")

grid.arrange(g1, g2, g3, nrow = 1)

Random number generation

True random numbers
- Generated from physical processes, like flipping coins
  - Hotbits uses radioactive decay
  - RANDOM.ORG uses atmospheric noise
  - Available in R via the random package
  - True randomness is slow and expensive
Pseudo-random number generation
- Next “random” number calculated from previous “random” number
- The first “random” number calculated from a seed
- If you start from the same seed value, all future random numbers will be the same
- Random number generating functions
  - Generating pseudo-random numbers from a set of values
    - sample()
    - slice_sample()
  - Functions generate random numbers that follow a statistical distribution
    - Name beginning with “r”
    - The first argument is the number of numbers to generate
    - Other arguments are distribution-specific

function	distribution	function	distribution	function	distribution
rbeta	Beta	rgeom	Geometric	rsignrank	Wilcoxon signed rank
rbinom	Binomial	rhyper	Hypergeometric	rt	t
rcauchy	Cauchy	rlnorm	Lognormal	runif	Uniform
rchisq	Chi-squared	rlogis	Logistic	rweibull	Weibull
rexp	Exponential	rnbinom	Negative binomial	rwilcox	Wilcoxon rank sum
rf	F	rnorm	Normal
rgamma	Gamma	rpois	Poisson

# Visualizing random numbers
randoms <- data.frame(beta = rbeta(5000, shape1 = 2, shape2 = 2))

ggplot(randoms, aes(beta)) +
  geom_histogram(binwidth = 0.1)

# Random numbers seeds
set.seed(20000229)
rnorm(5)

## [1] -1.6538464 -0.4028384 -0.1654293 -0.0733821  0.5171339

rnorm(5)

## [1]  1.9078109  0.3791112 -1.4986993  1.6252191  0.6933334

set.seed(20000229)
rnorm(5)

## [1] -1.6538464 -0.4028384 -0.1654293 -0.0733821  0.5171339

rnorm(5)

## [1]  1.9078109  0.3791112 -1.4986993  1.6252191  0.6933334

set.seed(20041004)
rnorm(5)

## [1] -0.65474142 -0.78542098 -0.01516189  0.15143890  0.52847192

rnorm(5)

## [1]  0.7475302  0.9743212  0.1738200 -0.7809440 -0.9304514

args(rnorm)

## function (n, mean = 0, sd = 1) 
## NULL

# Generating random numbers
n_numbers = 5000

randoms <- data.frame(
  uniform = runif(n_numbers, min = -3, max = 3),
  normal = rnorm(n_numbers, mean = 5, sd = 2)
)

gunif <- ggplot(randoms, aes(uniform)) +
  geom_histogram(binwidth = 0.25)

gnorm <- ggplot(randoms, aes(normal)) +
  geom_histogram(binwidth = 0.5)

grid.arrange(gunif, gnorm, nrow = 1)

Random sampling method

Simple random sampling

# Simple random sampling
set.seed(19000113)
coffee_ratings %>% slice_sample(n = 5)

set.seed(1)
attrition_samp <- attrition_pop %>% 
  rowid_to_column() %>% 
  slice_sample(n = 200)
attrition_samp

Systematic sampling

Systematic random sampling - Samples the population at regular intervals
- Only safe if a plot has no pattern in the scatter plot. That is, it just looks like noise
- Shuffling rows + systematic sampling is the same as simple random sampling

# Systematic random sampling
# Adding a row ID column
coffee_ratings_rowid <- coffee_ratings %>% 
  rowid_to_column()

# Determining how big the interval should be between each row to include in the sample
sample_size <- 5
pop_size <- nrow(coffee_ratings_rowid)
interval <- pop_size %/% sample_size

# Filtering for every two hundred and sixty seventh row
row_indexes <- seq_len(sample_size) * interval
coffee_ratings_rowid %>% slice(row_indexes)

g1 <- coffee_ratings_rowid %>% 
  ggplot(aes(x = rowid, y = aftertaste)) +
  geom_point() +
  geom_smooth() +
  ylim(6, 9)

# Making systematic sampling safe
# Randomize the row order before sampling
shuffled <- coffee_ratings_rowid %>% 
  slice_sample(prop = 1) %>% 
  select(-rowid) %>% 
  rowid_to_column()

g2 <- shuffled %>% 
  ggplot(aes(x = rowid, y = aftertaste)) +
  geom_point() +
  geom_smooth() +
  ylim(6, 9)

grid.arrange(g1, g2, nrow = 1)

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 1 rows containing missing values (`geom_point()`).

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Removed 1 rows containing missing values (`geom_point()`).

sample_size <- 200
pop_size <- nrow(attrition_pop)
interval <- pop_size %/% sample_size

row_indexes <- seq_len(sample_size) * interval

attrition_sys_samp <- attrition_pop %>% 
  rowid_to_column() %>% 
  slice(row_indexes)
attrition_sys_samp

attrition_pop_id <- attrition_pop %>% 
  rowid_to_column()

g1 <- ggplot(attrition_pop_id, aes(rowid, YearsAtCompany)) +
  geom_point() +
  geom_smooth()

attrition_shuffled <- attrition_pop %>% 
  slice_sample(prop = 1) %>% 
  rowid_to_column()

g2 <- ggplot(attrition_shuffled, aes(rowid, YearsAtCompany)) +
  geom_point() +
  geom_smooth()

grid.arrange(g1, g2, nrow = 1)

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Stratified sampling

Stratified sampling - A technique that allows to sample a population that contains subgroups
- Weighted random sampling

# Stratified sampling
# Coffees by country
top_counts <- coffee_ratings %>% 
  count(country_of_origin, sort = TRUE) %>% 
  head()
top_counts

# Filtering for 6 countries
top_counted_countries <- c("Mexico", "Colombia", "Guatemala", "Brazil", "Taiwan", "United States (Hawaii)")
coffee_ratings_top <- coffee_ratings %>% 
  filter(country_of_origin %in% top_counted_countries)

coffee_ratings_top <- coffee_ratings %>% 
  semi_join(top_counts)

## Joining with `by = join_by(country_of_origin)`

# Counts of a simple random sample
coffee_ratings_samp <- coffee_ratings_top %>% 
  slice_sample(prop = 0.1)

# Population
coffee_ratings_top %>% 
  count(country_of_origin, sort = TRUE) %>% 
  mutate(percent = 100 * n / sum(n))

# 10% sample - Disproportion
coffee_ratings_samp %>% 
  count(country_of_origin, sort = TRUE) %>% 
  mutate(percent = 100 * n / sum(n))

# Proportional stratified sampling
coffee_ratings_strat <- coffee_ratings_top %>% 
  group_by(country_of_origin) %>% 
  slice_sample(prop = 0.1) %>% 
  ungroup()

coffee_ratings_strat %>% 
  count(country_of_origin, sort = TRUE) %>% 
  mutate(percent = 100 * n / sum(n))

# Equal counts stratified sampling
coffee_ratings_eq <- coffee_ratings_top %>% 
  group_by(country_of_origin) %>% 
  slice_sample(n = 15) %>% 
  ungroup()

coffee_ratings_eq %>% 
  count(country_of_origin, sort = TRUE) %>% 
  mutate(percent = 100 * n / sum(n))

# Weighted random sampling
coffee_ratings_weight <- coffee_ratings_top %>% 
  mutate(weight = ifelse(country_of_origin == "Taiwan", 2, 1)) %>% 
  slice_sample(prop = 0.1, weight_by = weight)

coffee_ratings_weight %>% 
  count(country_of_origin, sort = TRUE) %>% 
  mutate(percent = 100 * n / sum(n))

# Stratified sampling
# Proportional stratified sampling
attrition_pop %>% 
  count(Education, sort = TRUE) %>% 
  mutate(percent = 100 * n / sum(n))

attrition_strat <- attrition_pop %>% 
  group_by(Education) %>% 
  slice_sample(prop = 0.4) %>% 
  ungroup()

education_counts_strat <- attrition_strat %>% 
  count(Education, sort = TRUE) %>% 
  mutate(percent = 100 * n / sum(n))
education_counts_strat

# Equal counts stratified sampling
attrition_eq <- attrition_pop %>%
  group_by(Education) %>% 
  slice_sample(n = 30) %>%
  ungroup()

education_counts_eq <- attrition_eq %>% 
  count(Education, sort = TRUE) %>% 
  mutate(percent = 100 * n / sum(n))
education_counts_eq

# Weighted sampling
g1 <- ggplot(attrition_pop, aes(YearsAtCompany)) +
  geom_histogram(binwidth = 1)

# Sample 400 employees weighted by YearsAtCompany
attrition_weight <- slice_sample(attrition_pop, n = 400, weight_by = YearsAtCompany)

g2 <- ggplot(attrition_weight, aes(YearsAtCompany)) +
  geom_histogram(binwidth = 1)

grid.arrange(g1, g2, nrow = 1)

mean(attrition_pop$YearsAtCompany)

## [1] 7.008163

mean(attrition_weight$YearsAtCompany) # The weighted simple random sample is biased

## [1] 10.76

Cluster sampling

Stratified vs Cluster sampling
- Stratified sampling
  - Split the population into subgroups
  - Use simple random sampling on every subgroup
  - Need to collect data from every subgroup, can be expensive
  - Cheaper alternative - Cluster sampling
- Cluster sampling
  - Use simple random sampling to pick some subgroups
  - Use simple random sampling on only those subgroups
Multistage sampling
- Cluster sampling is a type of multistage sampling
- You can have > 2 stages
- Countrywide surveys may sample states, counties, cities, and neighborhoods

# Varieties of coffee
varieties_pop <- unique(coffee_ratings$variety)

# Stage 1: sampling for subgroups
varieties_samp <- sample(varieties_pop, size = 3)

# Stage 2: sampling each group
coffee_ratings %>% 
  filter(variety %in% varieties_samp) %>% 
  group_by(variety) %>% 
  slice_sample(n = 5) %>% 
  ungroup()

job_roles_pop <- unique(attrition_pop$JobRole)
job_roles_samp <- sample(job_roles_pop, size = 4)

attrition_filtered <- filter(attrition_pop, JobRole %in% job_roles_samp)

attrition_clus <- attrition_filtered %>% 
  group_by(JobRole) %>% 
  slice_sample(n = 10)
attrition_clus

Comparing sampling methods

# Comparing sampling methods
# Review of sampling techniques
# Setup
top_counted_countries <- c("Mexico", "Colombia", "Guatemala", "Brazil", "Taiwan", "United States (Hawaii)")
coffee_ratings_top <- coffee_ratings %>% 
  filter(country_of_origin %in% top_counted_countries)

# Simple random sampling
coffee_ratings_srs <- coffee_ratings_top %>% 
  slice_sample(prop = 1/3)

# Stratified sampling
coffee_ratings_strat <- coffee_ratings_top %>% 
  group_by(country_of_origin) %>% 
  slice_sample(prop = 1/3) %>% 
  ungroup()

# Cluster sampling
top_countries_samp <- sample(top_counted_countries, size = 2)

coffee_ratings_clust <- coffee_ratings_top %>% 
  filter(country_of_origin %in% top_countries_samp) %>% 
  group_by(country_of_origin) %>% 
  slice_sample(n = nrow(coffee_ratings_top) %/% 6) %>% 
  ungroup()

# Calculating mean cup points
# The simple and stratified sample means are really close to the population mean
# Cluster sampling isn't quite as close, but that's typical
# Cluster sampling is designed to give you an answer that's almost as good, while using less data

# Population mean
coffee_ratings_top %>% 
  summarize(mean_points = mean(total_cup_points))

# Point estimates of the mean - sample mean
# Simple random sample
coffee_ratings_srs %>% 
  summarize(mean_points = mean(total_cup_points))

# Stratified sample
coffee_ratings_strat %>% 
  summarize(mean_points = mean(total_cup_points))

# Cluster sample
coffee_ratings_clust %>% 
  summarize(mean_points = mean(total_cup_points))

# Mean cup points by country
# Each of the sample means is pretty close to the population mean
# The same is true of the sample means from the stratified technique
# With cluster sampling, while the sample means are pretty close to the population mean, the obvious limitation is that you only get values for the two countries that were included in the sample
# If the mean cup points for each country is an important metric in your analysis, cluster sampling would be a bad idea

# Population
coffee_ratings_top %>% 
  group_by(country_of_origin) %>% 
  summarize(mean_points = mean(total_cup_points))

# Simple random sample
coffee_ratings_srs %>% 
  group_by(country_of_origin) %>% 
  summarize(mean_points = mean(total_cup_points))

# Stratified sample
coffee_ratings_strat %>% 
  group_by(country_of_origin) %>% 
  summarize(mean_points = mean(total_cup_points))

# Cluster sample
coffee_ratings_clust %>% 
  group_by(country_of_origin) %>% 
  summarize(mean_points = mean(total_cup_points))

attrition_srs <- attrition_pop %>% 
  slice_sample(prop = 1/4)

attrition_strat <- attrition_pop %>% 
  group_by(RelationshipSatisfaction) %>% 
  slice_sample(prop = 1/4) %>% 
  ungroup

satisfaction_unique <- unique(attrition_pop$RelationshipSatisfaction)
satisfaction_samp <- sample(satisfaction_unique, size = 2)
attrition_clust <- attrition_pop %>% 
  filter(RelationshipSatisfaction %in% satisfaction_samp) %>% 
  group_by(RelationshipSatisfaction) %>% 
  slice_sample(n = nrow(attrition_pop) %/% 4) %>% 
  ungroup

# Population
mean_attrition_pop <- attrition_pop %>% 
  group_by(RelationshipSatisfaction) %>% 
  summarise(mean_attrition = mean(Attrition == "Yes"))
mean_attrition_pop

# Simple random sample 
mean_attrition_srs <- attrition_srs %>% 
  group_by(RelationshipSatisfaction) %>% 
  summarise(mean_attrition = mean(Attrition == "Yes"))
mean_attrition_srs

# Stratified sample 
mean_attrition_strat <- attrition_strat %>% 
  group_by(RelationshipSatisfaction) %>% 
  summarise(mean_attrition = mean(Attrition == "Yes"))
mean_attrition_strat

# Cluster sample 
mean_attrition_clust <- attrition_clust %>% 
  group_by(RelationshipSatisfaction) %>% 
  summarise(mean_attrition = mean(Attrition == "Yes"))
mean_attrition_clust

Sampling distribution

Relative error of point estimates

Sample size - The number of observations, that is, the number of rows, in the sample
In general, larger sample sizes will give us more accurate results. The relative error decreases as the sample size increases

# Relative error vs. sample size
relative_errors <- function(sample_size){
  # Population parameter
  population_mean <- coffee_ratings %>% 
    summarize(mean_points = mean(total_cup_points)) %>% 
    pull(mean_points)

  # Point estimate
  sample_mean <- coffee_ratings %>% 
    slice_sample(n = sample_size) %>% 
    summarize(mean_points = mean(total_cup_points)) %>% 
    pull(mean_points)

  # Relative error as a percentage
  relative_error = 100 * abs(population_mean - sample_mean) / population_mean

  return(relative_error)
}

sample_size = c(1:1500)
errors <- data.frame(sample_size = sample_size, relative_error = sapply(sample_size, relative_errors))

# The black line is really noisy, particularly with small sample sizes
# The downward slope in the trend line is quite steep to begin with. Further to the right in the plot, the slope is less steep
# At the far right of the plot, where the sample size is the whole population, the error decreases to zero
ggplot(errors, aes(sample_size, relative_error)) +
  geom_line() +
  geom_smooth(method = "loess")

## `geom_smooth()` using formula = 'y ~ x'

# Calculating relative errors
mean_attrition_pop <- mean(attrition_pop$Attrition == "Yes")

attrition_srs10 <- slice_sample(attrition_pop, n = 10)
mean_attrition_srs10 <- mean(attrition_srs10$Attrition == "Yes")
rel_error_pct10 <- 100 * abs(mean_attrition_pop - mean_attrition_srs10) / mean_attrition_pop
rel_error_pct10

## [1] 24.05063

attrition_srs100 <- slice_sample(attrition_pop, n = 100)
mean_attrition_srs100 <- mean(attrition_srs100$Attrition == "Yes")
rel_error_pct100 <- 100 * abs(mean_attrition_pop - mean_attrition_srs100) / mean_attrition_pop
rel_error_pct100

## [1] 30.25316

Sampling distribution

Sampling distribution - A distribution of several replicates of point estimates
- When you calculate a point estimate such as a sample mean, the value you calculate depends on the rows that were included in the sample. That means that there is some randomness in the answer. In order to quantify the variation caused by this randomness, you can create many samples and calculate the sample mean (or other statistic) for each sample
- Decreasing the sample size, the range of the results is broader
- Increasing the sample size results in a narrower range
- Increasing the number of replicates does not affect the relative error of the sample means, it does result in a more consistent shape to the distribution

# Sampling distribution
mean_cup_points_5 <- replicate(
  n = 1000,
  expr = coffee_ratings %>% 
    slice_sample(n = 5) %>% 
    summarize(mean_cup_points = mean(total_cup_points)) %>% 
    pull(mean_cup_points)
)
sample_means_5 <- tibble(sample_mean = mean_cup_points_5)

mean_cup_points_30 <- replicate(
  n = 1000,
  expr = coffee_ratings %>% 
    slice_sample(n = 30) %>% 
    summarize(mean_cup_points = mean(total_cup_points)) %>% 
    pull(mean_cup_points)
)
sample_means_30 <- tibble(sample_mean = mean_cup_points_30)

mean_cup_points_150 <- replicate(
  n = 1000,
  expr = coffee_ratings %>% 
    slice_sample(n = 150) %>% 
    summarize(mean_cup_points = mean(total_cup_points)) %>% 
    pull(mean_cup_points)
)
sample_means_150 <- tibble(sample_mean = mean_cup_points_150)

# Distribution of sample means
g1 <- ggplot(sample_means_5, aes(sample_mean)) +
  geom_histogram(binwidth = 0.1) +
  labs(title = "Sample Size 5")

g2 <- ggplot(sample_means_30, aes(sample_mean)) +
  geom_histogram(binwidth = 0.1) +
  labs(title = "Sample Size 30")

g3 <- ggplot(sample_means_150, aes(sample_mean)) +
  geom_histogram(binwidth = 0.1) +
  labs(title = "Sample Size 150")

grid.arrange(g1, g2, g3, nrow = 1)

# Replicating samples
mean_attritions_rep_100 <- replicate(
  n = 100,
  attrition_pop %>% 
    slice_sample(n = 20) %>% 
    summarize(mean_attrition = mean(Attrition == "Yes")) %>% 
    pull(mean_attrition)
)
sample_means_rep_100 <- tibble(sample_mean = mean_attritions_rep_100)

mean_attritions_rep_1000 <- replicate(
  n = 1000,
  attrition_pop %>% 
    slice_sample(n = 20) %>% 
    summarize(mean_attrition = mean(Attrition == "Yes")) %>% 
    pull(mean_attrition)
)
sample_means_rep_1000 <- tibble(sample_mean = mean_attritions_rep_1000)


g1 <- ggplot(sample_means_rep_100, aes(sample_mean)) +
  geom_histogram(binwidth = 0.05)

g2 <- ggplot(sample_means_rep_1000, aes(sample_mean)) +
  geom_histogram(binwidth = 0.05)

grid.arrange(g1, g2, nrow = 1)

Approximate vs. Actual sampling distributions

Exact sampling distribution - Calculate exactly, rather than using an approximation
Sampling distribution - The distribution of a sample statistic

# Exact sampling distribution
dice <- expand_grid(
  die1 = 1:6,
  die2 = 1:6,
  die3 = 1:6,
  die4 = 1:6
  ) %>% 
  mutate(mean_roll = (die1 + die2 + die3 + die4) / 4)

g1 <- ggplot(dice, aes(factor(mean_roll))) +
  geom_bar()

# Approximate sampling distribution
# The number of outcomes increases fast
# Need to rely on approximations
sample_means_1000 <- replicate(
  n = 1000,
  expr = {
    four_rolls <- sample(1:6, size = 4, replace = TRUE)
    mean(four_rolls)
  }
)

sample_means <- tibble(sample_mean = sample_means_1000)

g2 <- ggplot(sample_means, aes(factor(sample_mean))) +
  geom_bar()

grid.arrange(g1, g2, nrow = 1)

# Exact sampling distribution
dice <- expand_grid(
  die1 = 1:8,
  die2 = 1:8,
  die3 = 1:8,
  die4 = 1:8,
  die5 = 1:8
  ) %>% 
  mutate(mean_roll = (die1 + die2 + die3 + die4 + die5) / 5)

g1 <- ggplot(dice, aes(factor(mean_roll))) +
  geom_bar()

# Approximate sampling distribution
sample_means_1000 <- replicate(
  n = 1000,
  expr = {
    five_rolls <- sample(1:8, size = 5, replace = TRUE)
    mean(five_rolls)
  }
)

sample_means <- tibble(sample_mean = sample_means_1000)

g2 <- ggplot(sample_means, aes(factor(sample_mean))) +
  geom_bar()

grid.arrange(g1, g2, nrow = 1)

Consequences of the central limit theorem

Averages of independent samples have approximately normal distributions
- As the sample size increases,
  - The distribution of the averages gets closer to being normally distributed
  - The width of the sampling distribution gets narrower
One other consequence of the central limit theorem is that if you divide the population mean by the square root of the sample size, you get an estimate of the standard deviation of the sampling distribution

# Sampling distribution
mean_cup_points_5 <- replicate(
  n = 5000,
  expr = coffee_ratings %>% 
    slice_sample(n = 5) %>% 
    summarize(mean_cup_points = mean(total_cup_points)) %>% 
    pull(mean_cup_points)
)
sample_means_5 <- tibble(sample_mean = mean_cup_points_5)

mean_cup_points_20 <- replicate(
  n = 5000,
  expr = coffee_ratings %>% 
    slice_sample(n = 20) %>% 
    summarize(mean_cup_points = mean(total_cup_points)) %>% 
    pull(mean_cup_points)
)
sample_means_20 <- tibble(sample_mean = mean_cup_points_20)

mean_cup_points_80 <- replicate(
  n = 5000,
  expr = coffee_ratings %>% 
    slice_sample(n = 80) %>% 
    summarize(mean_cup_points = mean(total_cup_points)) %>% 
    pull(mean_cup_points)
)
sample_means_80 <- tibble(sample_mean = mean_cup_points_80)

mean_cup_points_320 <- replicate(
  n = 5000,
  expr = coffee_ratings %>% 
    slice_sample(n = 320) %>% 
    summarize(mean_cup_points = mean(total_cup_points)) %>% 
    pull(mean_cup_points)
)
sample_means_320 <- tibble(sample_mean = mean_cup_points_320)

# Distribution of sample means
g1 <- ggplot(sample_means_5, aes(sample_mean)) +
  geom_histogram() +
  labs(title = "Sample Size 5")

g2 <- ggplot(sample_means_20, aes(sample_mean)) +
  geom_histogram() +
  labs(title = "Sample Size 20")

g3 <- ggplot(sample_means_80, aes(sample_mean)) +
  geom_histogram() +
  labs(title = "Sample Size 80")

g4 <- ggplot(sample_means_320, aes(sample_mean)) +
  geom_histogram() +
  labs(title = "Sample Size 320")

grid.arrange(g1, g2, g3, g4, nrow = 1)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Population & sampling distribution means
data.frame(
  sample_size = c("Population", 5, 20, 80, 320),
  mean_sample_mean = c(
    mean(coffee_ratings$total_cup_points),
    mean(mean_cup_points_5),
    mean(mean_cup_points_20),
    mean(mean_cup_points_80),
    mean(mean_cup_points_320)
  )
)

# Population & sampling distribution standard deviations
data.frame(
  sample_size = c("Population", 5, 20, 80, 320),
  sd_sample_mean = c(
    sd(coffee_ratings$total_cup_points),
    sd(mean_cup_points_5),
    sd(mean_cup_points_20),
    sd(mean_cup_points_80),
    sd(mean_cup_points_320)
  )
)

# Population & sampling distribution means
mean_attritions_5 <- replicate(
  n = 1000,
  attrition_pop %>% 
    slice_sample(n = 5) %>% 
    summarise(mean_attrition = mean(Attrition == "Yes")) %>% 
    pull(mean_attrition)
)
sampling_distribution_5 <- tibble(
  replicate = 1:1000,
  mean_attrition = mean_attritions_5
)

mean_attritions_50 <- replicate(
  n = 1000,
  attrition_pop %>% 
    slice_sample(n = 50) %>% 
    summarise(mean_attrition = mean(Attrition == "Yes")) %>% 
    pull(mean_attrition)
)
sampling_distribution_50 <- tibble(
  replicate = 1:1000,
  mean_attrition = mean_attritions_50
)

mean_attritions_500 <- replicate(
  n = 1000,
  attrition_pop %>% 
    slice_sample(n = 500) %>% 
    summarise(mean_attrition = mean(Attrition == "Yes")) %>% 
    pull(mean_attrition)
)
sampling_distribution_500 <- tibble(
  replicate = 1:1000,
  mean_attrition = mean_attritions_500
)

data.frame(
  sample_size = c("Population", 5, 50, 500),
  mean_mean_attrition = c(
    mean(attrition_pop$Attrition == "Yes"),
    mean(sampling_distribution_5$mean_attrition),
    mean(sampling_distribution_50$mean_attrition),
    mean(sampling_distribution_500$mean_attrition)
  )
)

# Population and sampling distribution variation
data.frame(
  sample_size = c("Population", 5, 50, 500),
  mean_mean_attrition = c(
    sd(attrition_pop$Attrition == "Yes"),
    sd(sampling_distribution_5$mean_attrition),
    sd(sampling_distribution_50$mean_attrition),
    sd(sampling_distribution_500$mean_attrition)
  )
)

Bootstrapping

Sampling without replacement
Sampling with replacement - Resampling
- Why sample with replacement?
  - Think of the coffee_ratings data as being a sample of a larger population of all coffees
  - Think about each coffee in our sample as being representative of many different coffees that we don’t have in our sample, but do exist in the population
  - Sampling with replacement is a proxy for including different members of these groups in our sample
Bootstrapping - Building up a theoretical population from your sample
- The opposite of sampling from a population
  - Sampling - Going from a population to a smaller sample
- Bootstrapping use case
  - Develop understanding of sampling variability using a single sample
    - Which is important in cases where you aren’t able to sample multiple times from a population to create a sampling distribution as you’ve seen
- Bootstrapping process
  - Make a resample of the same size as the original sample
  - Calculate the statistic of interest for this bootstrap sample
  - Repeat steps 1 and 2 many times
  - The resulting statistics are called bootstrap statistics and when viewed to see their variability a bootstrap distribution

# Bootstrapping
# Coffee data preparation
coffee_focus <- coffee_ratings %>% 
  select(variety, country_of_origin, flavor) %>% 
  rowid_to_column()

glimpse(coffee_focus)

## Rows: 1,339
## Columns: 4
## $ rowid             <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
## $ variety           <chr> NA, "Other", "Bourbon", NA, "Other", NA, "Other", NA…
## $ country_of_origin <chr> "Ethiopia", "Ethiopia", "Guatemala", "Ethiopia", "Et…
## $ flavor            <dbl> 8.83, 8.67, 8.50, 8.58, 8.50, 8.42, 8.50, 8.33, 8.67…

# Resampling with slice_sample()
coffee_resamp <- coffee_focus %>% 
  slice_sample(prop = 1, replace = TRUE)

# Repeated coffees
coffee_resamp %>% count(rowid, sort = TRUE)

# Missing coffees
coffee_resamp %>% 
  summarize(
    coffees_included = n_distinct(rowid),
    coffees_not_included = n() - coffees_included
  )

# Bootstrapping coffee mean flavor
# Step 3. Repeat many times
mean_flavors_1000 <- replicate(
  n = 1000,
  expr = {
    coffee_focus %>% 
      # Step 1. Resample
      slice_sample(prop = 1, replace = TRUE) %>% 
      # Step 2. Calculate statistic
      summarize(mean_flavor = mean(flavor, na.rm = TRUE)) %>% 
      pull(mean_flavor)
  }
)

# Bootstrap distribution histogram
bootstrap_distn <- tibble(resample_mean = mean_flavors_1000)

ggplot(bootstrap_distn, aes(resample_mean)) +
  geom_histogram(binwidth = 0.0025)

Comparing sampling and bootstrap distributions

Sample, bootstrap distribution, population means
- Sample mean and bootstrap distribution mean values are really close. However, the true population mean is actually slightly different
  - The bootstrap distribution mean is usually almost identical to the sample mean
  - It may not be a good estimate of the population mean if the original sample wasn’t closely representative of the population
  - Bootstrapping cannot correct biases due to differences between your sample and the population
Sample, bootstrap distribution, population standard deviation
- Standard error is the standard deviation of the statistic of interest
  - Standard deviation describes variability within a single sample
  - Standard error describes variability across multiple samples of a population
  - Bootstrap standard deviation = Standard error of mean (SEM) in certain situations
- Quantifying what variability you might expect in your sample statistic as you go from one sample to another
- Estimated standard error is the standard deviation of the bootstrap distribution for a sample statistic
- The bootstrap distribution standard error times the square root of the sample size estimates the standard deviation in the population
- Although bootstrapping was poor at estimating the population mean, it is, in general, great for estimating the population standard deviation
- The sampling distribution mean is the best estimate of the true population mean; the bootstrap distribution mean is closest to the original sample mean

# Comparing sampling and bootstrap distributions
# Coffee focused subset
set.seed(19790801)

coffee_sample <- coffee_ratings %>% 
  select(variety, country_of_origin, flavor) %>% 
  rowid_to_column() %>% 
  slice_sample(n = 500)

glimpse(coffee_sample)

## Rows: 500
## Columns: 4
## $ rowid             <int> 10, 278, 458, 622, 131, 385, 1292, 47, 904, 1020, 55…
## $ variety           <chr> "Other", "Bourbon", NA, "Caturra", "Caturra", "Yello…
## $ country_of_origin <chr> "Ethiopia", "Guatemala", "Colombia", "Thailand", "Co…
## $ flavor            <dbl> 8.58, 7.75, 7.75, 7.50, 8.00, 7.83, 7.17, 8.08, 7.33…

# The bootstrap of mean coffee flavors
mean_flavors_1000 <- replicate(
  n = 1000,
  expr = coffee_sample %>% 
  slice_sample(prop = 1, replace = TRUE) %>% 
  summarize(mean_flavor = mean(flavor, na.rm = TRUE)) %>% 
  pull(mean_flavor)
)
bootstrap_distn <- tibble(resample_mean = mean_flavors_1000)

# Mean flavor bootstrap distribution
ggplot(bootstrap_distn, aes(resample_mean)) +
  geom_histogram(binwidth = 0.0025)

# Sample, bootstrap distribution, population means
data.frame(
  sample_mean = mean(coffee_sample$flavor),

  # In the bootstrap distribution, each value is an estimate of the mean flavor score
  # Each of these values corresponds to one potential sample mean from the theoretical population
  # If we take the mean of those means, we get our best guess of the population mean
  bootstrap_mean = mean(bootstrap_distn$resample_mean), # Estimated population mean

  population_mean = mean(coffee_ratings$flavor)
)

# Sample, bootstrap distribution, population standard deviation
data.frame(
  sample_sd = sd(coffee_sample$flavor),
  bootstrap_sd = sd(bootstrap_distn$resample_mean),
  population_sd = sd(coffee_ratings$flavor),
  estimated_population_sd = sd(bootstrap_distn$resample_mean) * sqrt(500)
)

Confidence intervals

“Values within one standard deviation of the mean” includes a large number of values from each of these distributions
We’ll define a related concept called a confidence interval
E.g. Predicting the weather
- Point estimate = 47 °F (8.3 °C)
- 40 to 54 °F is a confidence interval
- Sometimes written as 47 °F (40 °F, 54 °F) or 47 °F [40 °F, 54 °F] or 47 ± 7 °F
- 7 °F is the margin of error
Method
- Mean plus or minus one standard deviation
- Quantile method
- Standard error method
  - Inverse cumulative distribution function
    - PDF: The bell curve
    - CDF: Integrate to get area under bell curve
    - Inv. CDF: Flip x and y axes

# Confidence intervals
mean_flavors_1000 <- replicate(
  n = 1000,
  expr = coffee_sample %>% 
    slice_sample(prop = 1, replace = TRUE) %>% 
    summarize(mean_flavor = mean(flavor, na.rm = TRUE)) %>% 
    pull(mean_flavor)
  )
coffee_boot_distn <- tibble(resample_mean = mean_flavors_1000)

# Mean of the resamples
coffee_boot_distn %>% 
  summarize(
    mean_resample_mean = mean(resample_mean),
    mean_minus_1sd = mean_resample_mean - sd(resample_mean),
    mean_plus_1sd = mean_resample_mean + sd(resample_mean),
    lower = quantile(resample_mean, 0.025),
    upper = quantile(resample_mean, 0.975)
  )

ggplot(coffee_boot_distn, aes(resample_mean)) +
  geom_histogram(binwidth = 0.002) +

  # Mean
  geom_vline(aes(xintercept = mean(resample_mean)), color = "blue") +
  geom_text(aes(mean(resample_mean), 0, label = round(mean(resample_mean), 2)), vjust=1) +

  # Mean plus or minus one standard deviation
  # Confidence interval of one standard deviation doesn't cover a lot of the distribution
  geom_vline(aes(xintercept = mean(resample_mean) - sd(resample_mean)), color = "green") +
  geom_text(aes(mean(mean(resample_mean) - sd(resample_mean)), 0, label = round(mean(resample_mean) - sd(resample_mean), 2)), vjust=1) +
  geom_vline(aes(xintercept = mean(resample_mean) + sd(resample_mean)), color = "green") +
  geom_text(aes(mean(resample_mean) + sd(resample_mean), 0, label = round(mean(resample_mean) + sd(resample_mean), 2)), vjust=1) +

  # Quantile method for confidence intervals
  geom_vline(aes(xintercept = quantile(resample_mean, 0.025)), color = "red") +
  geom_text(aes(quantile(resample_mean, 0.025), 0, label = round(quantile(resample_mean, 0.025), 2)), vjust=1) +
  geom_vline(aes(xintercept = quantile(resample_mean, 0.975)), color = "red") +
  geom_text(aes(quantile(resample_mean, 0.975), 0, label = round(quantile(resample_mean, 0.975), 2)), vjust=1)

# Inverse cumulative distribution function
normal_inv_cdf <- tibble(p = seq(-0.001, 0.999, 0.001), inv_cdf = qnorm(p))

## Warning in qnorm(p): NaNs produced

ggplot(normal_inv_cdf, aes(p, inv_cdf)) +
  geom_line() +
  geom_point(aes(0.025, qnorm(0.025)), colour="red") +
  geom_text(aes(0.125, qnorm(0.025), label = paste0("(", 0.025, ", ", round(qnorm(0.025), 2), ")"))) +
  geom_point(aes(0.975, qnorm(0.975)), colour="red") +
  geom_text(aes(0.875, qnorm(0.975), label = paste0("(", 0.975, ", ", round(qnorm(0.975), 2), ")")))

## Warning: Removed 1 row containing missing values (`geom_line()`).

# Standard error method for confidence interval
coffee_boot_distn %>% 
  summarize(point_estimate = mean(resample_mean),
  std_error = sd(resample_mean),
  lower = qnorm(0.025, point_estimate, std_error),
  upper = qnorm(0.975, point_estimate, std_error)
)

The most important things

The standard deviation of the sampling distribution (a.k.a. the standard error) of a statistic is well-approximated by the standard deviation of the bootstrap distribution of a statistic
When calculating confidence intervals, it’s OK to assume that bootstrap distributions are approximately normally distributed